LLVM 13.0.0
AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Triple.h"
31#include "llvm/ADT/Twine.h"
49#include "llvm/IR/Attributes.h"
50#include "llvm/IR/Constants.h"
51#include "llvm/IR/DataLayout.h"
52#include "llvm/IR/DebugLoc.h"
54#include "llvm/IR/Function.h"
56#include "llvm/IR/GlobalValue.h"
57#include "llvm/IR/IRBuilder.h"
58#include "llvm/IR/Instruction.h"
61#include "llvm/IR/Intrinsics.h"
62#include "llvm/IR/IntrinsicsAArch64.h"
63#include "llvm/IR/Module.h"
66#include "llvm/IR/Type.h"
67#include "llvm/IR/Use.h"
68#include "llvm/IR/Value.h"
74#include "llvm/Support/Debug.h"
82#include <algorithm>
83#include <bitset>
84#include <cassert>
85#include <cctype>
86#include <cstdint>
87#include <cstdlib>
88#include <iterator>
89#include <limits>
90#include <tuple>
91#include <utility>
92#include <vector>
93
94using namespace llvm;
95using namespace llvm::PatternMatch;
96
97#define DEBUG_TYPE "aarch64-lower"
98
99STATISTIC(NumTailCalls, "Number of tail calls");
100STATISTIC(NumShiftInserts, "Number of vector shift inserts");
101STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
102
103// FIXME: The necessary dtprel relocations don't seem to be supported
104// well in the GNU bfd and gold linkers at the moment. Therefore, by
105// default, for now, fall back to GeneralDynamic code generation.
107 "aarch64-elf-ldtls-generation", cl::Hidden,
108 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
109 cl::init(false));
110
111static cl::opt<bool>
112EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
113 cl::desc("Enable AArch64 logical imm instruction "
114 "optimization"),
115 cl::init(true));
116
117// Temporary option added for the purpose of testing functionality added
118// to DAGCombiner.cpp in D92230. It is expected that this can be removed
119// in the future once both implementations are based on MGATHER rather
120// than the GLD1 nodes added for the SVE gather load intrinsics.
121static cl::opt<bool>
122EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
123 cl::desc("Combine extends of AArch64 masked "
124 "gather intrinsics"),
125 cl::init(true));
126
127/// Value type used for condition codes.
128static const MVT MVT_CC = MVT::i32;
129
130static inline EVT getPackedSVEVectorVT(EVT VT) {
131 switch (VT.getSimpleVT().SimpleTy) {
132 default:
133 llvm_unreachable("unexpected element type for vector");
134 case MVT::i8:
135 return MVT::nxv16i8;
136 case MVT::i16:
137 return MVT::nxv8i16;
138 case MVT::i32:
139 return MVT::nxv4i32;
140 case MVT::i64:
141 return MVT::nxv2i64;
142 case MVT::f16:
143 return MVT::nxv8f16;
144 case MVT::f32:
145 return MVT::nxv4f32;
146 case MVT::f64:
147 return MVT::nxv2f64;
148 case MVT::bf16:
149 return MVT::nxv8bf16;
150 }
151}
152
153// NOTE: Currently there's only a need to return integer vector types. If this
154// changes then just add an extra "type" parameter.
156 switch (EC.getKnownMinValue()) {
157 default:
158 llvm_unreachable("unexpected element count for vector");
159 case 16:
160 return MVT::nxv16i8;
161 case 8:
162 return MVT::nxv8i16;
163 case 4:
164 return MVT::nxv4i32;
165 case 2:
166 return MVT::nxv2i64;
167 }
168}
169
172 "Expected scalable predicate vector type!");
173 switch (VT.getVectorMinNumElements()) {
174 default:
175 llvm_unreachable("unexpected element count for vector");
176 case 2:
177 return MVT::nxv2i64;
178 case 4:
179 return MVT::nxv4i32;
180 case 8:
181 return MVT::nxv8i16;
182 case 16:
183 return MVT::nxv16i8;
184 }
185}
186
187/// Returns true if VT's elements occupy the lowest bit positions of its
188/// associated register class without any intervening space.
189///
190/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
191/// same register class, but only nxv8f16 can be treated as a packed vector.
192static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
194 "Expected legal vector type!");
195 return VT.isFixedLengthVector() ||
197}
198
199// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
200// predicate and end with a passthru value matching the result type.
234
236 const AArch64Subtarget &STI)
237 : TargetLowering(TM), Subtarget(&STI) {
238 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
239 // we have to make something up. Arbitrarily, choose ZeroOrOne.
241 // When comparing vectors the result sets the different elements in the
242 // vector to all-one or all-zero.
244
245 // Set up the register classes.
246 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
247 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
248
249 if (Subtarget->hasLS64()) {
250 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
253 }
254
255 if (Subtarget->hasFPARMv8()) {
256 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
257 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
258 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
259 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
260 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
261 }
262
263 if (Subtarget->hasNEON()) {
264 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
265 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
266 // Someone set us up the NEON.
267 addDRTypeForNEON(MVT::v2f32);
268 addDRTypeForNEON(MVT::v8i8);
269 addDRTypeForNEON(MVT::v4i16);
270 addDRTypeForNEON(MVT::v2i32);
271 addDRTypeForNEON(MVT::v1i64);
272 addDRTypeForNEON(MVT::v1f64);
273 addDRTypeForNEON(MVT::v4f16);
274 if (Subtarget->hasBF16())
275 addDRTypeForNEON(MVT::v4bf16);
276
277 addQRTypeForNEON(MVT::v4f32);
278 addQRTypeForNEON(MVT::v2f64);
279 addQRTypeForNEON(MVT::v16i8);
280 addQRTypeForNEON(MVT::v8i16);
281 addQRTypeForNEON(MVT::v4i32);
282 addQRTypeForNEON(MVT::v2i64);
283 addQRTypeForNEON(MVT::v8f16);
284 if (Subtarget->hasBF16())
285 addQRTypeForNEON(MVT::v8bf16);
286 }
287
288 if (Subtarget->hasSVE()) {
289 // Add legal sve predicate types
290 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
291 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
292 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
293 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
294
295 // Add legal sve data types
296 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
297 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
298 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
299 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
300
301 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
302 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
303 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
304 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
305 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
306 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
307
308 if (Subtarget->hasBF16()) {
309 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
310 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
311 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
312 }
313
314 if (Subtarget->useSVEForFixedLengthVectors()) {
316 if (useSVEForFixedLengthVectorVT(VT))
317 addRegisterClass(VT, &AArch64::ZPRRegClass);
318
320 if (useSVEForFixedLengthVectorVT(VT))
321 addRegisterClass(VT, &AArch64::ZPRRegClass);
322 }
323
324 for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
333 }
334
335 for (auto VT :
339
340 for (auto VT :
342 MVT::nxv2f64 }) {
354
366 }
367 }
368
369 // Compute derived properties from the register classes
371
372 // Provide all sorts of operation actions
406
410
414
416
417 // Custom lowering hooks are needed for XOR
418 // to fold it into CSINC/CSINV.
421
422 // Virtually no operation on f128 is legal, but LLVM can't expand them when
423 // there's a valid register class, so we need custom operations in most cases.
447
448 // Lowering for many of the conversions is actually specified by the non-f128
449 // type. The LowerXXX function will be trivial when f128 isn't involved.
480
485
486 // Variable arguments.
491
492 // Variable-sized objects.
495
496 if (Subtarget->isTargetWindows())
498 else
500
501 // Constant pool entries
503
504 // BlockAddress
506
507 // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
516
517 // AArch64 lacks both left-rotate and popcount instructions.
523 }
524
525 // AArch64 doesn't have i32 MULH{S|U}.
528
529 // AArch64 doesn't have {U|S}MUL_LOHI.
532
536
539
545 }
552
553 // Custom lower Add/Sub/Mul with overflow.
566
575 if (Subtarget->hasFullFP16())
577 else
579
613
614 if (!Subtarget->hasFullFP16()) {
638
639 // promote v4f16 to v4f32 when that is known to be safe.
648
665
687 }
688
689 // AArch64 has implementations of a lot of rounding-like FP operations.
690 for (MVT Ty : {MVT::f32, MVT::f64}) {
706 }
707
708 if (Subtarget->hasFullFP16()) {
720 }
721
723
726
732
733 // Generate outline atomics library calls only if LSE was not specified for
734 // the subtarget.
735 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
761#define LCALLNAMES(A, B, N) \
762 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
763 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
764 setLibcallName(A##N##_REL, #B #N "_rel"); \
765 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
766#define LCALLNAME4(A, B) \
767 LCALLNAMES(A, B, 1) \
768 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
769#define LCALLNAME5(A, B) \
770 LCALLNAMES(A, B, 1) \
771 LCALLNAMES(A, B, 2) \
772 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
773 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
774 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
775 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
776 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
777 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
778 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
779#undef LCALLNAMES
780#undef LCALLNAME4
781#undef LCALLNAME5
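 // Illustrative expansion of the helpers above (derived from the macros, not
 // an exhaustive list): LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
 // registers, among others, RTLIB::OUTLINE_ATOMIC_LDADD4_ACQ_REL with the
 // libcall name "__aarch64_ldadd4_acq_rel", i.e. the 4-byte acquire-release
 // variant of the outlined atomic load-add helper.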
782 }
783
784 // 128-bit loads and stores can be done without expanding
787
788 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of the
789 // custom lowering, as there are no unpaired non-temporal stores and
790 // legalization will break up 256-bit inputs.
798
799 // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
800 // This requires the Performance Monitors extension.
801 if (Subtarget->hasPerfMon())
803
804 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
805 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
806 // Issue __sincos_stret if available.
809 } else {
812 }
813
814 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
815 // MSVCRT doesn't have powi; fall back to pow
816 setLibcallName(RTLIB::POWI_F32, nullptr);
817 setLibcallName(RTLIB::POWI_F64, nullptr);
818 }
819
820 // Make floating-point constants legal for the large code model, so they don't
821 // become loads from the constant pool.
822 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
825 }
826
827 // AArch64 does not have floating-point extending loads, i1 sign-extending
828 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
829 for (MVT VT : MVT::fp_valuetypes()) {
834 }
835 for (MVT VT : MVT::integer_valuetypes())
837
845
849
850 // Indexed loads and stores are supported.
851 for (unsigned im = (unsigned)ISD::PRE_INC;
869 }
870
871 // Trap.
875
876 // We combine OR nodes for bitfield operations.
878 // Try to create BICs for vector ANDs.
880
881 // Vector add and sub nodes may conceal a high-half opportunity.
882 // Also, try to fold ADD into CSINC/CSINV..
890
891 // TODO: Do the same for FP_TO_*INT_SAT.
895
896 // Try and combine setcc with csel
898
900
909 if (Subtarget->supportsAddressTopByteIgnored())
911
913
916
923
925
926 // In case of strict alignment, avoid an excessive number of byte wide stores.
930
935
937
941
943
945
947
948 // Set required alignment.
950 // Set preferred alignments.
953
954 // Only change the limit for entries in a jump table if specified by
955 // the subtarget, but not at the command line.
956 unsigned MaxJT = STI.getMaximumJumpTableSize();
959
961
963
964 if (Subtarget->hasNEON()) {
965 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
966 // silliness like this:
993
999
1001
1002 // AArch64 doesn't have direct vector->f32 conversion instructions for
1003 // elements smaller than i32, so promote the input to i32 first.
1010
1011 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1016 // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
1017 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1020
1021 if (Subtarget->hasFullFP16()) {
1026 } else {
1027 // when AArch64 doesn't have fullfp16 support, promote the input
1028 // to i32 first.
1033 }
1034
1043
1044 // AArch64 doesn't have MUL.2d:
1046 // Custom handling for some quad-vector types to detect MULL.
1050
1051 // Saturates
1052 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1058 }
1059
1061 MVT::v4i32}) {
1064 }
1065
1066 // Vector reductions
1067 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1069 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1072
1074 }
1075 }
1076 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1083 }
1085
1088 // Likewise, narrowing and extending vector loads/stores aren't handled
1089 // directly.
1092
1093 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1096 } else {
1099 }
1102
1105
1111 }
1112 }
1113
1114 // AArch64 has implementations of a lot of rounding-like FP operations.
1115 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
1123 }
1124
1125 if (Subtarget->hasFullFP16()) {
1126 for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
1134 }
1135 }
1136
1137 if (Subtarget->hasSVE())
1139
1141
1148 }
1149
1150 if (Subtarget->hasSVE()) {
1151 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1190
1196 }
1197
1198 // Illegal unpacked integer vector types.
1199 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1202 }
1203
1204 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1208
1209 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1218
1222
1223 // There are no legal MVT::nxv16f## based types.
1224 if (VT != MVT::nxv16i1) {
1227 }
1228 }
1229
1230 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1238 }
1239
1242 // Avoid marking truncating FP stores as legal to prevent the
1243 // DAGCombiner from creating unsupported truncating stores.
1245 // SVE does not have floating-point extending loads.
1249 }
1250 }
1251
1252 // SVE supports truncating stores of 64 and 128-bit vectors
1258
1294
1296 }
1297
1298 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1303 }
1304
1306
1309
1310 // NOTE: Currently this has to happen after computeRegisterProperties rather
1311 // than the preferred option of combining it with the addRegisterClass call.
1312 if (Subtarget->useSVEForFixedLengthVectors()) {
1314 if (useSVEForFixedLengthVectorVT(VT))
1315 addTypeForFixedLengthSVE(VT);
1317 if (useSVEForFixedLengthVectorVT(VT))
1318 addTypeForFixedLengthSVE(VT);
1319
1320 // 64-bit results can mean a bigger-than-NEON input.
1321 for (auto VT : {MVT::v8i8, MVT::v4i16})
1324
1325 // 128-bit results imply a bigger-than-NEON input.
1326 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1328 for (auto VT : {MVT::v8f16, MVT::v4f32})
1330
1331 // These operations are not supported on NEON but SVE can do them.
1370
1371 // Int operations with no NEON support.
1372 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1379 }
1380
1381 // FP operations with no NEON support.
1382 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1385
1386 // Use SVE for vectors with more than 2 elements.
1387 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1389 }
1390
1395 }
1396
1398}
1399
1400void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1401 assert(VT.isVector() && "VT should be a vector type");
1402
1403 if (VT.isFloatingPoint()) {
1407 }
1408
1409 // Mark vector float intrinsics as expand.
1410 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1419 }
1420
1421 // But we do support custom-lowering for FCOPYSIGN.
1422 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1423 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1425
1437
1441 for (MVT InnerVT : MVT::all_valuetypes())
1443
1444 // CNT supports only B element sizes; use UADDLP afterwards to widen.
1445 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1447
1453
1456
1457 if (!VT.isFloatingPoint())
1459
1460 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1461 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1462 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1463 setOperationAction(Opcode, VT, Legal);
1464
1465 // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
1466 if (VT.isFloatingPoint() &&
1468 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1469 for (unsigned Opcode :
1471 setOperationAction(Opcode, VT, Legal);
1472
1473 if (Subtarget->isLittleEndian()) {
1474 for (unsigned im = (unsigned)ISD::PRE_INC;
1478 }
1479 }
1480}
1481
1482void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1483 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1484
1485 // By default everything must be expanded.
1486 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1487 setOperationAction(Op, VT, Expand);
1488
1489 // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1491
1492 if (VT.isFloatingPoint()) {
1504 }
1505
1506 // Mark integer truncating stores as having custom lowering
1507 if (VT.isInteger()) {
1508 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1509 while (InnerVT != VT) {
1510 setTruncStoreAction(VT, InnerVT, Custom);
1511 InnerVT = InnerVT.changeVectorElementType(
1512 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1513 }
1514 }
1515
1516 // Lower fixed length vector operations to scalable equivalents.
1598}
1599
1600void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1601 addRegisterClass(VT, &AArch64::FPR64RegClass);
1602 addTypeForNEON(VT);
1603}
1604
1605void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1606 addRegisterClass(VT, &AArch64::FPR128RegClass);
1607 addTypeForNEON(VT);
1608}
1609
1611 LLVMContext &C, EVT VT) const {
1612 if (!VT.isVector())
1613 return MVT::i32;
1614 if (VT.isScalableVector())
1617}
1618
1619static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1620 const APInt &Demanded,
1622 unsigned NewOpc) {
1623 uint64_t OldImm = Imm, NewImm, Enc;
1624 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1625
1626 // Return if the immediate is already all zeros, all ones, a bimm32 or a
1627 // bimm64.
1628 if (Imm == 0 || Imm == Mask ||
1630 return false;
1631
1632 unsigned EltSize = Size;
1633 uint64_t DemandedBits = Demanded.getZExtValue();
1634
1635 // Clear bits that are not demanded.
1636 Imm &= DemandedBits;
1637
1638 while (true) {
1639 // The goal here is to set the non-demanded bits in a way that minimizes
1640 // the number of switching between 0 and 1. In order to achieve this goal,
1641 // we set the non-demanded bits to the value of the preceding demanded bits.
1642 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1643 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1644 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1645 // The final result is 0b11000011.
1646 uint64_t NonDemandedBits = ~DemandedBits;
1647 uint64_t InvertedImm = ~Imm & DemandedBits;
1648 uint64_t RotatedImm =
1649 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1651 uint64_t Sum = RotatedImm + NonDemandedBits;
1652 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1653 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1654 NewImm = (Imm | Ones) & Mask;
1655
1656 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1657 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1658 // we halve the element size and continue the search.
1659 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1660 break;
1661
1662 // We cannot shrink the element size any further if it is 2-bits.
1663 if (EltSize == 2)
1664 return false;
1665
1666 EltSize /= 2;
1667 Mask >>= EltSize;
1668 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1669
1670 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
1671 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1672 return false;
1673
1674 // Merge the upper and lower halves of Imm and DemandedBits.
1675 Imm |= Hi;
1676 DemandedBits |= DemandedBitsHi;
1677 }
1678
1679 ++NumOptimizedImms;
1680
1681 // Replicate the element across the register width.
1682 while (EltSize < Size) {
1683 NewImm |= NewImm << EltSize;
1684 EltSize *= 2;
1685 }
1686
1687 (void)OldImm;
1688 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1689 "demanded bits should never be altered");
1690 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1691
1692 // Create the new constant immediate node.
1693 EVT VT = Op.getValueType();
1694 SDLoc DL(Op);
1695 SDValue New;
1696
1697 // If the new constant immediate is all-zeros or all-ones, let the target
1698 // independent DAG combine optimize this node.
1699 if (NewImm == 0 || NewImm == OrigMask) {
1700 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1701 TLO.DAG.getConstant(NewImm, DL, VT));
1702 // Otherwise, create a machine node so that target independent DAG combine
1703 // doesn't undo this optimization.
1704 } else {
1706 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1707 New = SDValue(
1708 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1709 }
1710
1711 return TLO.CombineTo(Op, New);
1712}
1713
1715 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1716 TargetLoweringOpt &TLO) const {
1717 // Delay this optimization to as late as possible.
1718 if (!TLO.LegalOps)
1719 return false;
1720
1722 return false;
1723
1724 EVT VT = Op.getValueType();
1725 if (VT.isVector())
1726 return false;
1727
1728 unsigned Size = VT.getSizeInBits();
1729 assert((Size == 32 || Size == 64) &&
1730 "i32 or i64 is expected after legalization.");
1731
1732 // Exit early if we demand all bits.
1733 if (DemandedBits.countPopulation() == Size)
1734 return false;
1735
1736 unsigned NewOpc;
1737 switch (Op.getOpcode()) {
1738 default:
1739 return false;
1740 case ISD::AND:
1741 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1742 break;
1743 case ISD::OR:
1744 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1745 break;
1746 case ISD::XOR:
1747 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1748 break;
1749 }
1750 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1751 if (!C)
1752 return false;
1753 uint64_t Imm = C->getZExtValue();
1754 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1755}
1756
1757/// computeKnownBitsForTargetNode - Determine which of the bits specified in
1758/// Mask are known to be either zero or one and return them in Known.
1760 const SDValue Op, KnownBits &Known,
1761 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1762 switch (Op.getOpcode()) {
1763 default:
1764 break;
1765 case AArch64ISD::CSEL: {
1767 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1768 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1769 Known = KnownBits::commonBits(Known, Known2);
1770 break;
1771 }
1773 case AArch64ISD::ADDlow: {
1774 if (!Subtarget->isTargetILP32())
1775 break;
1776 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1777 Known.Zero = APInt::getHighBitsSet(64, 32);
1778 break;
1779 }
1781 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1782 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1783 switch (IntID) {
1784 default: return;
1785 case Intrinsic::aarch64_ldaxr:
1786 case Intrinsic::aarch64_ldxr: {
1787 unsigned BitWidth = Known.getBitWidth();
1788 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1789 unsigned MemBits = VT.getScalarSizeInBits();
1791 return;
1792 }
1793 }
1794 break;
1795 }
1797 case ISD::INTRINSIC_VOID: {
1798 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1799 switch (IntNo) {
1800 default:
1801 break;
1802 case Intrinsic::aarch64_neon_umaxv:
1803 case Intrinsic::aarch64_neon_uminv: {
1804 // Figure out the datatype of the vector operand. The UMINV instruction
1805 // will zero extend the result, so we can mark as known zero all the
1806 // bits larger than the element datatype. 32-bit or larger doesn't need
1807 // this as those are legal types and will be handled by isel directly.
1808 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1809 unsigned BitWidth = Known.getBitWidth();
1810 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1811 assert(BitWidth >= 8 && "Unexpected width!");
1813 Known.Zero |= Mask;
1814 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1815 assert(BitWidth >= 16 && "Unexpected width!");
1817 Known.Zero |= Mask;
1818 }
1819 break;
1820 } break;
1821 }
1822 }
1823 }
1824}
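// Illustrative example: for llvm.aarch64.neon.uminv.i32 applied to a v8i8
// vector, the reduction result is an 8-bit value zero-extended to i32, so the
// handling above reports bits [31:8] of the result as known zero.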
1825
1830
1832 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1833 bool *Fast) const {
1834 if (Subtarget->requiresStrictAlign())
1835 return false;
1836
1837 if (Fast) {
1838 // Some CPUs are fine with unaligned stores except for 128-bit ones.
1839 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1840 // See comments in performSTORECombine() for more details about
1841 // these conditions.
1842
1843 // Code that uses clang vector extensions can mark that it
1844 // wants unaligned accesses to be treated as fast by
1845 // underspecifying alignment to be 1 or 2.
1846 Alignment <= 2 ||
1847
1848 // Disregard v2i64. Memcpy lowering produces those and splitting
1849 // them regresses performance on micro-benchmarks and olden/bh.
1850 VT == MVT::v2i64;
1851 }
1852 return true;
1853}
1854
1855// Same as above but handling LLTs instead.
1857 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1858 bool *Fast) const {
1859 if (Subtarget->requiresStrictAlign())
1860 return false;
1861
1862 if (Fast) {
1863 // Some CPUs are fine with unaligned stores except for 128-bit ones.
1864 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1865 Ty.getSizeInBytes() != 16 ||
1866 // See comments in performSTORECombine() for more details about
1867 // these conditions.
1868
1869 // Code that uses clang vector extensions can mark that it
1870 // wants unaligned accesses to be treated as fast by
1871 // underspecifying alignment to be 1 or 2.
1872 Alignment <= 2 ||
1873
1874 // Disregard v2i64. Memcpy lowering produces those and splitting
1875 // them regresses performance on micro-benchmarks and olden/bh.
1876 Ty == LLT::fixed_vector(2, 64);
1877 }
1878 return true;
1879}
1880
1881FastISel *
1886
1887const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1888#define MAKE_CASE(V) \
1889 case V: \
1890 return #V;
1891 switch ((AArch64ISD::NodeType)Opcode) {
1893 break;
2171 }
2172#undef MAKE_CASE
2173 return nullptr;
2174}
2175
2178 MachineBasicBlock *MBB) const {
2179 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2180 // phi node:
2181
2182 // OrigBB:
2183 // [... previous instrs leading to comparison ...]
2184 // b.ne TrueBB
2185 // b EndBB
2186 // TrueBB:
2187 // ; Fallthrough
2188 // EndBB:
2189 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2190
2191 MachineFunction *MF = MBB->getParent();
2192 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2193 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2194 DebugLoc DL = MI.getDebugLoc();
2196
2197 Register DestReg = MI.getOperand(0).getReg();
2198 Register IfTrueReg = MI.getOperand(1).getReg();
2199 Register IfFalseReg = MI.getOperand(2).getReg();
2200 unsigned CondCode = MI.getOperand(3).getImm();
2201 bool NZCVKilled = MI.getOperand(4).isKill();
2202
2205 MF->insert(It, TrueBB);
2206 MF->insert(It, EndBB);
2207
2208 // Transfer rest of current basic-block to EndBB
2209 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2210 MBB->end());
2212
2213 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2214 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2215 MBB->addSuccessor(TrueBB);
2216 MBB->addSuccessor(EndBB);
2217
2218 // TrueBB falls through to the end.
2219 TrueBB->addSuccessor(EndBB);
2220
2221 if (!NZCVKilled) {
2222 TrueBB->addLiveIn(AArch64::NZCV);
2223 EndBB->addLiveIn(AArch64::NZCV);
2224 }
2225
2226 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2228 .addMBB(TrueBB)
2230 .addMBB(MBB);
2231
2232 MI.eraseFromParent();
2233 return EndBB;
2234}
2235
2243
2245 MachineInstr &MI, MachineBasicBlock *BB) const {
2246 switch (MI.getOpcode()) {
2247 default:
2248#ifndef NDEBUG
2249 MI.dump();
2250#endif
2251 llvm_unreachable("Unexpected instruction for custom inserter!");
2252
2253 case AArch64::F128CSEL:
2254 return EmitF128CSEL(MI, BB);
2255
2256 case TargetOpcode::STACKMAP:
2257 case TargetOpcode::PATCHPOINT:
2258 case TargetOpcode::STATEPOINT:
2259 return emitPatchPoint(MI, BB);
2260
2261 case AArch64::CATCHRET:
2262 return EmitLoweredCatchRet(MI, BB);
2263 }
2264}
2265
2266//===----------------------------------------------------------------------===//
2267// AArch64 Lowering private implementation.
2268//===----------------------------------------------------------------------===//
2269
2270//===----------------------------------------------------------------------===//
2271// Lowering Code
2272//===----------------------------------------------------------------------===//
2273
2274// Forward declarations of SVE fixed length lowering helpers
2279 SelectionDAG &DAG);
2280
2281/// isZerosVector - Check whether SDNode N is a zero-filled vector.
2282static bool isZerosVector(const SDNode *N) {
2283 // Look through a bit convert.
2284 while (N->getOpcode() == ISD::BITCAST)
2285 N = N->getOperand(0).getNode();
2286
2288 return true;
2289
2290 if (N->getOpcode() != AArch64ISD::DUP)
2291 return false;
2292
2293 auto Opnd0 = N->getOperand(0);
2294 auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
2295 auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
2296 return (CINT && CINT->isNullValue()) || (CFP && CFP->isZero());
2297}
2298
2299/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2300/// CC
2302 switch (CC) {
2303 default:
2304 llvm_unreachable("Unknown condition code!");
2305 case ISD::SETNE:
2306 return AArch64CC::NE;
2307 case ISD::SETEQ:
2308 return AArch64CC::EQ;
2309 case ISD::SETGT:
2310 return AArch64CC::GT;
2311 case ISD::SETGE:
2312 return AArch64CC::GE;
2313 case ISD::SETLT:
2314 return AArch64CC::LT;
2315 case ISD::SETLE:
2316 return AArch64CC::LE;
2317 case ISD::SETUGT:
2318 return AArch64CC::HI;
2319 case ISD::SETUGE:
2320 return AArch64CC::HS;
2321 case ISD::SETULT:
2322 return AArch64CC::LO;
2323 case ISD::SETULE:
2324 return AArch64CC::LS;
2325 }
2326}
2327
2328/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2330 AArch64CC::CondCode &CondCode,
2333 switch (CC) {
2334 default:
2335 llvm_unreachable("Unknown FP condition!");
2336 case ISD::SETEQ:
2337 case ISD::SETOEQ:
2338 CondCode = AArch64CC::EQ;
2339 break;
2340 case ISD::SETGT:
2341 case ISD::SETOGT:
2342 CondCode = AArch64CC::GT;
2343 break;
2344 case ISD::SETGE:
2345 case ISD::SETOGE:
2346 CondCode = AArch64CC::GE;
2347 break;
2348 case ISD::SETOLT:
2349 CondCode = AArch64CC::MI;
2350 break;
2351 case ISD::SETOLE:
2352 CondCode = AArch64CC::LS;
2353 break;
2354 case ISD::SETONE:
2355 CondCode = AArch64CC::MI;
2357 break;
2358 case ISD::SETO:
2359 CondCode = AArch64CC::VC;
2360 break;
2361 case ISD::SETUO:
2362 CondCode = AArch64CC::VS;
2363 break;
2364 case ISD::SETUEQ:
2365 CondCode = AArch64CC::EQ;
2367 break;
2368 case ISD::SETUGT:
2369 CondCode = AArch64CC::HI;
2370 break;
2371 case ISD::SETUGE:
2372 CondCode = AArch64CC::PL;
2373 break;
2374 case ISD::SETLT:
2375 case ISD::SETULT:
2376 CondCode = AArch64CC::LT;
2377 break;
2378 case ISD::SETLE:
2379 case ISD::SETULE:
2380 CondCode = AArch64CC::LE;
2381 break;
2382 case ISD::SETNE:
2383 case ISD::SETUNE:
2384 CondCode = AArch64CC::NE;
2385 break;
2386 }
2387}
2388
2389/// Convert a DAG fp condition code to an AArch64 CC.
2390/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2391/// should be AND'ed instead of OR'ed.
2393 AArch64CC::CondCode &CondCode,
2396 switch (CC) {
2397 default:
2398 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2400 break;
2401 case ISD::SETONE:
2402 // (a one b)
2403 // == ((a olt b) || (a ogt b))
2404 // == ((a ord b) && (a une b))
2405 CondCode = AArch64CC::VC;
2407 break;
2408 case ISD::SETUEQ:
2409 // (a ueq b)
2410 // == ((a uno b) || (a oeq b))
2411 // == ((a ule b) && (a uge b))
2412 CondCode = AArch64CC::PL;
2414 break;
2415 }
2416}
2417
2418/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2419/// CC usable with the vector instructions. Fewer operations are available
2420/// without a real NZCV register, so we have to use less efficient combinations
2421/// to get the same effect.
2423 AArch64CC::CondCode &CondCode,
2425 bool &Invert) {
2426 Invert = false;
2427 switch (CC) {
2428 default:
2429 // Mostly the scalar mappings work fine.
2430 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2431 break;
2432 case ISD::SETUO:
2433 Invert = true;
2435 case ISD::SETO:
2436 CondCode = AArch64CC::MI;
2438 break;
2439 case ISD::SETUEQ:
2440 case ISD::SETULT:
2441 case ISD::SETULE:
2442 case ISD::SETUGT:
2443 case ISD::SETUGE:
2444 // All of the compare-mask comparisons are ordered, but we can switch
2445 // between the two by a double inversion. E.g. ULE == !OGT.
2446 Invert = true;
2447 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2448 CondCode, CondCode2);
2449 break;
2450 }
2451}
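// Illustrative use (assumed lowering, for clarity): a vector SETULE has no
// direct compare-mask instruction, so it is handled as the double inversion
// ULE == !OGT: emit the ordered greater-than mask (FCMGT) and let the caller
// bitwise-invert the result because Invert is set.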
2452
2453static bool isLegalArithImmed(uint64_t C) {
2454 // Matches AArch64DAGToDAGISel::SelectArithImmed().
2455 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2456 LLVM_DEBUG(dbgs() << "Is imm " << C
2457 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2458 return IsLegal;
2459}
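// Worked examples, derived from the check above: 0xFFF (4095) is legal (fits
// in 12 bits), 0x1000 and 0xFFF000 are legal (a 12-bit value shifted left by
// 12), while 0x1001 is not, since it needs set bits both below and above the
// 12-bit boundary.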
2460
2461// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
2462// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
2463// can be set differently by this operation. It comes down to whether
2464// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2465// everything is fine. If not then the optimization is wrong. Thus general
2466// comparisons are only valid if op2 != 0.
2467//
2468// So, finally, the only LLVM-native comparisons that don't mention C and V
2469// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2470// the absence of information about op2.
2471static bool isCMN(SDValue Op, ISD::CondCode CC) {
2472 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2473 (CC == ISD::SETEQ || CC == ISD::SETNE);
2474}
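// Worked example of why op2 must be non-zero for the flag-based conditions:
// with op2 == 0, "cmp w0, #0" (SUBS) always sets C (a subtraction of zero
// never borrows), while "cmn w0, #0" (ADDS) never sets C (adding zero never
// carries), so an unsigned condition like HS would evaluate differently.
// Only SETEQ/SETNE ignore C and V, hence the restriction above.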
2475
2477 SelectionDAG &DAG, SDValue Chain,
2478 bool IsSignaling) {
2479 EVT VT = LHS.getValueType();
2480 assert(VT != MVT::f128);
2481 assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
2482 unsigned Opcode =
2484 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2485}
2486
2488 const SDLoc &dl, SelectionDAG &DAG) {
2489 EVT VT = LHS.getValueType();
2490 const bool FullFP16 =
2491 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2492
2493 if (VT.isFloatingPoint()) {
2494 assert(VT != MVT::f128);
2495 if (VT == MVT::f16 && !FullFP16) {
2496 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2497 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2498 VT = MVT::f32;
2499 }
2500 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2501 }
2502
2503 // The CMP instruction is just an alias for SUBS, and representing it as
2504 // SUBS means that it's possible to get CSE with subtract operations.
2505 // A later phase can perform the optimization of setting the destination
2506 // register to WZR/XZR if it ends up being unused.
2507 unsigned Opcode = AArch64ISD::SUBS;
2508
2509 if (isCMN(RHS, CC)) {
2510 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
2511 Opcode = AArch64ISD::ADDS;
2512 RHS = RHS.getOperand(1);
2513 } else if (isCMN(LHS, CC)) {
2514 // As we are looking for EQ/NE compares, the operands can be commuted; can
2515 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
2516 Opcode = AArch64ISD::ADDS;
2517 LHS = LHS.getOperand(1);
2518 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2519 if (LHS.getOpcode() == ISD::AND) {
2520 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2521 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2522 // of the signed comparisons.
2523 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2524 DAG.getVTList(VT, MVT_CC),
2525 LHS.getOperand(0),
2526 LHS.getOperand(1));
2527 // Replace all users of (and X, Y) with newly generated (ands X, Y)
2528 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2529 return ANDSNode.getValue(1);
2530 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2531 // Use result of ANDS
2532 return LHS.getValue(1);
2533 }
2534 }
2535
2536 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2537 .getValue(1);
2538}
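// Illustrative selections for the folds above (assumed final code, for
// clarity): "setcc eq x, (sub 0, y)" is emitted as ADDS and matches
// "cmn x, y", while "setcc eq (and x, y), 0" reuses an ANDS node that a later
// pass can render as "tst x, y" once the integer result is unused.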
2539
2540/// \defgroup AArch64CCMP CMP;CCMP matching
2541///
2542/// These functions deal with the formation of CMP;CCMP;... sequences.
2543/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2544/// a comparison. They set the NZCV flags to a predefined value if their
2545/// predicate is false. This allows to express arbitrary conjunctions, for
2546/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2547/// expressed as:
2548/// cmp A
2549/// ccmp B, inv(CB), CA
2550/// check for CB flags
2551///
2552/// This naturally lets us implement chains of AND operations with SETCC
2553/// operands. And we can even implement some other situations by transforming
2554/// them:
2555/// - We can implement (NEG SETCC) i.e. negating a single comparison by
2556/// negating the flags used in a CCMP/FCCMP operations.
2557/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2558/// by negating the flags we test for afterwards. i.e.
2559/// NEG (CMP CCMP CCMP ...) can be implemented.
2560/// - Note that we can only ever negate all previously processed results.
2561/// What we cannot implement by flipping the flags to test is a negation
2562/// of two sub-trees (because the negation affects all sub-trees emitted so
2563/// far, so the 2nd sub-tree we emit would also affect the first).
2564/// With those tools we can implement some OR operations:
2565/// - (OR (SETCC A) (SETCC B)) can be implemented via:
2566/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2567/// - After transforming OR to NEG/AND combinations we may be able to use NEG
2568/// elimination rules from earlier to implement the whole thing as a
2569/// CCMP/FCCMP chain.
2570///
2571/// As complete example:
2572/// or (or (setCA (cmp A)) (setCB (cmp B)))
2573/// (and (setCC (cmp C)) (setCD (cmp D)))"
2574/// can be reassociated to:
2575/// or (and (setCC (cmp C)) (setCD (cmp D)))
2576/// (or (setCA (cmp A)) (setCB (cmp B)))
2577/// can be transformed to:
2578/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2579/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
2580/// which can be implemented as:
2581/// cmp C
2582/// ccmp D, inv(CD), CC
2583/// ccmp A, CA, inv(CD)
2584/// ccmp B, CB, inv(CA)
2585/// check for CB flags
2586///
2587/// A counterexample is "or (and A B) (and C D)" which translates to
2588/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
2589/// can only implement 1 of the inner (not) operations, but not both!
2590/// @{
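///
/// A small concrete instance (illustrative, not taken from the code below):
/// "a == 0 && b == 5" can be checked with
///   cmp  w0, #0
///   ccmp w1, #5, #0, eq
///   b.eq taken
/// where the ccmp performs the second compare only if the first was equal and
/// otherwise forces NZCV to 0000, making the final EQ test fail.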
2591
2592/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2594 ISD::CondCode CC, SDValue CCOp,
2595 AArch64CC::CondCode Predicate,
2597 const SDLoc &DL, SelectionDAG &DAG) {
2598 unsigned Opcode = 0;
2599 const bool FullFP16 =
2600 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2601
2602 if (LHS.getValueType().isFloatingPoint()) {
2603 assert(LHS.getValueType() != MVT::f128);
2604 if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2605 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2606 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2607 }
2608 Opcode = AArch64ISD::FCCMP;
2609 } else if (RHS.getOpcode() == ISD::SUB) {
2610 SDValue SubOp0 = RHS.getOperand(0);
2611 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2612 // See emitComparison() on why we can only do this for SETEQ and SETNE.
2613 Opcode = AArch64ISD::CCMN;
2614 RHS = RHS.getOperand(1);
2615 }
2616 }
2617 if (Opcode == 0)
2618 Opcode = AArch64ISD::CCMP;
2619
2620 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
2624 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2625}
2626
2627/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2628/// expressed as a conjunction. See \ref AArch64CCMP.
2629/// \param CanNegate Set to true if we can negate the whole sub-tree just by
2630/// changing the conditions on the SETCC tests.
2631/// (this means we can call emitConjunctionRec() with
2632/// Negate==true on this sub-tree)
2633/// \param MustBeFirst Set to true if this subtree needs to be negated and we
2634/// cannot do the negation naturally. We are required to
2635/// emit the subtree first in this case.
2636/// \param WillNegate Is true if we are called when the result of this
2637/// subexpression must be negated. This happens when the
2638/// outer expression is an OR. We can use this fact to know
2639/// that we have a double negation (or (or ...) ...) that
2640/// can be implemented for free.
2641static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2642 bool &MustBeFirst, bool WillNegate,
2643 unsigned Depth = 0) {
2644 if (!Val.hasOneUse())
2645 return false;
2646 unsigned Opcode = Val->getOpcode();
2647 if (Opcode == ISD::SETCC) {
2648 if (Val->getOperand(0).getValueType() == MVT::f128)
2649 return false;
2650 CanNegate = true;
2651 MustBeFirst = false;
2652 return true;
2653 }
2654 // Protect against exponential runtime and stack overflow.
2655 if (Depth > 6)
2656 return false;
2657 if (Opcode == ISD::AND || Opcode == ISD::OR) {
2658 bool IsOR = Opcode == ISD::OR;
2659 SDValue O0 = Val->getOperand(0);
2660 SDValue O1 = Val->getOperand(1);
2661 bool CanNegateL;
2662 bool MustBeFirstL;
2664 return false;
2665 bool CanNegateR;
2666 bool MustBeFirstR;
2668 return false;
2669
2671 return false;
2672
2673 if (IsOR) {
2674 // For an OR expression we need to be able to naturally negate at least
2675 // one side or we cannot do the transformation at all.
2676 if (!CanNegateL && !CanNegateR)
2677 return false;
2678 // If the result of the OR will be negated and we can naturally negate
2679 // the leaves, then this sub-tree as a whole negates naturally.
2681 // If we cannot naturally negate the whole sub-tree, then this must be
2682 // emitted first.
2684 } else {
2685 assert(Opcode == ISD::AND && "Must be OR or AND");
2686 // We cannot naturally negate an AND operation.
2687 CanNegate = false;
2689 }
2690 return true;
2691 }
2692 return false;
2693}
2694
2695/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
2696/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
2697/// Tries to transform the given i1 producing node @p Val to a series of compare
2698/// and conditional compare operations. @returns an NZCV flags producing node
2699/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
2700/// the transformation was not possible.
2701/// \p Negate is true if we want this sub-tree being negated just by changing
2702/// SETCC conditions.
2704 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2705 AArch64CC::CondCode Predicate) {
2706 // We're at a tree leaf, produce a conditional comparison operation.
2707 unsigned Opcode = Val->getOpcode();
2708 if (Opcode == ISD::SETCC) {
2709 SDValue LHS = Val->getOperand(0);
2710 SDValue RHS = Val->getOperand(1);
2711 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2712 bool isInteger = LHS.getValueType().isInteger();
2713 if (Negate)
2714 CC = getSetCCInverse(CC, LHS.getValueType());
2715 SDLoc DL(Val);
2716 // Determine OutCC and handle FP special case.
2717 if (isInteger) {
2719 } else {
2720 assert(LHS.getValueType().isFloatingPoint());
2723 // Some floating point conditions can't be tested with a single condition
2724 // code. Construct an additional comparison in this case.
2725 if (ExtraCC != AArch64CC::AL) {
2727 if (!CCOp.getNode())
2728 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
2729 else
2730 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
2731 ExtraCC, DL, DAG);
2732 CCOp = ExtraCmp;
2733 Predicate = ExtraCC;
2734 }
2735 }
2736
2737 // Produce a normal comparison if we are first in the chain
2738 if (!CCOp)
2739 return emitComparison(LHS, RHS, CC, DL, DAG);
2740 // Otherwise produce a ccmp.
2741 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
2742 DAG);
2743 }
2744 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
2745
2746 bool IsOR = Opcode == ISD::OR;
2747
2748 SDValue LHS = Val->getOperand(0);
2749 bool CanNegateL;
2750 bool MustBeFirstL;
2752 assert(ValidL && "Valid conjunction/disjunction tree");
2753 (void)ValidL;
2754
2755 SDValue RHS = Val->getOperand(1);
2756 bool CanNegateR;
2757 bool MustBeFirstR;
2759 assert(ValidR && "Valid conjunction/disjunction tree");
2760 (void)ValidR;
2761
2762 // Swap sub-tree that must come first to the right side.
2763 if (MustBeFirstL) {
2764 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
2765 std::swap(LHS, RHS);
2768 }
2769
2770 bool NegateR;
2771 bool NegateAfterR;
2772 bool NegateL;
2773 bool NegateAfterAll;
2774 if (Opcode == ISD::OR) {
2775 // Swap the sub-tree that we can negate naturally to the left.
2776 if (!CanNegateL) {
2777 assert(CanNegateR && "at least one side must be negatable");
2778 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
2779 assert(!Negate);
2780 std::swap(LHS, RHS);
2781 NegateR = false;
2782 NegateAfterR = true;
2783 } else {
2784 // Negate the left sub-tree if possible, otherwise negate the result.
2787 }
2788 NegateL = true;
2789 NegateAfterAll = !Negate;
2790 } else {
2791 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
2792 assert(!Negate && "Valid conjunction/disjunction tree");
2793
2794 NegateL = false;
2795 NegateR = false;
2796 NegateAfterR = false;
2797 NegateAfterAll = false;
2798 }
2799
2800 // Emit sub-trees.
2802 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
2803 if (NegateAfterR)
2806 if (NegateAfterAll)
2808 return CmpL;
2809}
2810
2811/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
2812/// In some cases this is even possible with OR operations in the expression.
2813/// See \ref AArch64CCMP.
2814/// \see emitConjunctionRec().
2817 bool DummyCanNegate;
2818 bool DummyMustBeFirst;
2820 return SDValue();
2821
2822 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
2823}
2824
2825/// @}
2826
2827/// Returns how profitable it is to fold a comparison's operand's shift and/or
2828/// extension operations.
2830 auto isSupportedExtend = [&](SDValue V) {
2831 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
2832 return true;
2833
2834 if (V.getOpcode() == ISD::AND)
2835 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
2836 uint64_t Mask = MaskCst->getZExtValue();
2837 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
2838 }
2839
2840 return false;
2841 };
2842
2843 if (!Op.hasOneUse())
2844 return 0;
2845
2846 if (isSupportedExtend(Op))
2847 return 1;
2848
2849 unsigned Opc = Op.getOpcode();
2850 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
2851 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
2852 uint64_t Shift = ShiftCst->getZExtValue();
2853 if (isSupportedExtend(Op.getOperand(0)))
2854 return (Shift <= 4) ? 2 : 1;
2855 EVT VT = Op.getValueType();
2856 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
2857 return 1;
2858 }
2859
2860 return 0;
2861}
2862
2865 const SDLoc &dl) {
2866 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2867 EVT VT = RHS.getValueType();
2868 uint64_t C = RHSC->getZExtValue();
2869 if (!isLegalArithImmed(C)) {
2870 // Constant does not fit, try adjusting it by one?
2871 switch (CC) {
2872 default:
2873 break;
2874 case ISD::SETLT:
2875 case ISD::SETGE:
2876 if ((VT == MVT::i32 && C != 0x80000000 &&
2877 isLegalArithImmed((uint32_t)(C - 1))) ||
2878 (VT == MVT::i64 && C != 0x80000000ULL &&
2879 isLegalArithImmed(C - 1ULL))) {
2880 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2881 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2882 RHS = DAG.getConstant(C, dl, VT);
2883 }
2884 break;
2885 case ISD::SETULT:
2886 case ISD::SETUGE:
2887 if ((VT == MVT::i32 && C != 0 &&
2888 isLegalArithImmed((uint32_t)(C - 1))) ||
2889 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
2890 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2891 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2892 RHS = DAG.getConstant(C, dl, VT);
2893 }
2894 break;
2895 case ISD::SETLE:
2896 case ISD::SETGT:
2897 if ((VT == MVT::i32 && C != INT32_MAX &&
2898 isLegalArithImmed((uint32_t)(C + 1))) ||
2899 (VT == MVT::i64 && C != INT64_MAX &&
2900 isLegalArithImmed(C + 1ULL))) {
2901 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2902 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2903 RHS = DAG.getConstant(C, dl, VT);
2904 }
2905 break;
2906 case ISD::SETULE:
2907 case ISD::SETUGT:
2908 if ((VT == MVT::i32 && C != UINT32_MAX &&
2909 isLegalArithImmed((uint32_t)(C + 1))) ||
2910 (VT == MVT::i64 && C != UINT64_MAX &&
2911 isLegalArithImmed(C + 1ULL))) {
2912 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2913 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2914 RHS = DAG.getConstant(C, dl, VT);
2915 }
2916 break;
2917 }
2918 }
2919 }
2920
2921 // Comparisons are canonicalized so that the RHS operand is simpler than the
2922 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2923 // can fold some shift+extend operations on the RHS operand, so swap the
2924 // operands if that can be done.
2925 //
2926 // For example:
2927 // lsl w13, w11, #1
2928 // cmp w13, w12
2929 // can be turned into:
2930 // cmp w12, w11, lsl #1
2931 if (!isa<ConstantSDNode>(RHS) ||
2932 !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2933 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2934
2936 std::swap(LHS, RHS);
2938 }
2939 }
2940
2941 SDValue Cmp;
2942 AArch64CC::CondCode AArch64CC;
2943 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2945
2946 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2947 // For the i8 operand, the largest immediate is 255, so this can be easily
2948 // encoded in the compare instruction. For the i16 operand, however, the
2949 // largest immediate cannot be encoded in the compare.
2950 // Therefore, use a sign extending load and cmn to avoid materializing the
2951 // -1 constant. For example,
2952 // movz w1, #65535
2953 // ldrh w0, [x0, #0]
2954 // cmp w0, w1
2955 // >
2956 // ldrsh w0, [x0, #0]
2957 // cmn w0, #1
2958 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
2959 // if and only if (sext LHS) == (sext RHS). The checks are in place to
2960 // ensure both the LHS and RHS are truly zero extended and to make sure the
2961 // transformation is profitable.
2962 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2963 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2964 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2965 LHS.getNode()->hasNUsesOfValue(1, 0)) {
2966 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2968 SDValue SExt =
2969 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2970 DAG.getValueType(MVT::i16));
2971 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2972 RHS.getValueType()),
2973 CC, dl, DAG);
2974 AArch64CC = changeIntCCToAArch64CC(CC);
2975 }
2976 }
2977
2978 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2979 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2980 if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2981 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2982 }
2983 }
2984 }
2985
2986 if (!Cmp) {
2987 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2988 AArch64CC = changeIntCCToAArch64CC(CC);
2989 }
2990 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2991 return Cmp;
2992}
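// Worked example of the constant adjustment above (illustrative): for
// "x s< 4097", the immediate 4097 is not a legal arithmetic immediate but
// 4096 is, and "x s< 4097" is equivalent to "x s<= 4096", so the comparison
// is rewritten from SETLT/#4097 to SETLE/#4096 and the constant no longer
// needs to be materialized in a register.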
2993
2994static std::pair<SDValue, SDValue>
2996 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2997 "Unsupported value type");
2999 SDLoc DL(Op);
3000 SDValue LHS = Op.getOperand(0);
3001 SDValue RHS = Op.getOperand(1);
3002 unsigned Opc = 0;
3003 switch (Op.getOpcode()) {
3004 default:
3005 llvm_unreachable("Unknown overflow instruction!");
3006 case ISD::SADDO:
3007 Opc = AArch64ISD::ADDS;
3008 CC = AArch64CC::VS;
3009 break;
3010 case ISD::UADDO:
3011 Opc = AArch64ISD::ADDS;
3012 CC = AArch64CC::HS;
3013 break;
3014 case ISD::SSUBO:
3015 Opc = AArch64ISD::SUBS;
3016 CC = AArch64CC::VS;
3017 break;
3018 case ISD::USUBO:
3019 Opc = AArch64ISD::SUBS;
3020 CC = AArch64CC::LO;
3021 break;
3022 // Multiply needs a little bit of extra work.
3023 case ISD::SMULO:
3024 case ISD::UMULO: {
3025 CC = AArch64CC::NE;
3026 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3027 if (Op.getValueType() == MVT::i32) {
3028 // Extend to 64-bits, then perform a 64-bit multiply.
3029 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3030 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3031 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3032 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3034
3035 // Check that the result fits into a 32-bit integer.
3036 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3037 if (IsSigned) {
3038 // cmp xreg, wreg, sxtw
3040 Overflow =
3042 } else {
3043 // tst xreg, #0xffffffff00000000
3044 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3045 Overflow =
3047 }
3048 break;
3049 }
3050 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3051 // For the 64-bit multiply
3052 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3053 if (IsSigned) {
3054 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3056 DAG.getConstant(63, DL, MVT::i64));
3057 // It is important that LowerBits is last, otherwise the arithmetic
3058 // shift will not be folded into the compare (SUBS).
3059 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3061 .getValue(1);
3062 } else {
3063 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3064 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3065 Overflow =
3066 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3067 DAG.getConstant(0, DL, MVT::i64),
3068 UpperBits).getValue(1);
3069 }
3070 break;
3071 }
3072 } // switch (...)
3073
3074 if (Opc) {
3075 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3076
3077 // Emit the AArch64 operation with overflow check.
3078 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3079 Overflow = Value.getValue(1);
3080 }
3081 return std::make_pair(Value, Overflow);
3082}
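// Illustrative example (not taken from this file): for a 32-bit signed
// addition, the {Value, Overflow} pair returned above corresponds to IR such
// as
//   %r = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
// which typically selects to
//   adds w0, w0, w1        // Value, NZCV updated
//   cset w1, vs            // Overflow, read via the VS condition
// The exact register allocation and scheduling may differ.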
3083
3084SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3085 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3086 return LowerToScalableOp(Op, DAG);
3087
3088 SDValue Sel = Op.getOperand(0);
3089 SDValue Other = Op.getOperand(1);
3090 SDLoc dl(Sel);
3091
3092 // If the operand is an overflow checking operation, invert the condition
3093 // code and kill the Not operation. I.e., transform:
3094 // (xor (overflow_op_bool, 1))
3095 // -->
3096 // (csel 1, 0, invert(cc), overflow_op_bool)
3097 // ... which later gets transformed to just a cset instruction with an
3098 // inverted condition code, rather than a cset + eor sequence.
3099 if (isOverflowIntrOpRes(Sel)) {
3100 // Only lower legal XALUO ops.
3101 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3102 return SDValue();
3103
3104 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3105 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3106 AArch64CC::CondCode CC;
3107 SDValue Value, Overflow;
3108 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3109 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3110 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3111 CCVal, Overflow);
3112 }
3113 // If neither operand is a SELECT_CC, give up.
3114 if (Sel.getOpcode() != ISD::SELECT_CC)
3115 std::swap(Sel, Other);
3116 if (Sel.getOpcode() != ISD::SELECT_CC)
3117 return Op;
3118
3119 // The folding we want to perform is:
3120 // (xor x, (select_cc a, b, cc, 0, -1) )
3121 // -->
3122 // (csel x, (xor x, -1), cc ...)
3123 //
3124 // The latter will get matched to a CSINV instruction.
3125
3126 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3127 SDValue LHS = Sel.getOperand(0);
3128 SDValue RHS = Sel.getOperand(1);
3129 SDValue TVal = Sel.getOperand(2);
3130 SDValue FVal = Sel.getOperand(3);
3131
3132 // FIXME: This could be generalized to non-integer comparisons.
3133 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3134 return Op;
3135
3136 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3137 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3138
3139 // The values aren't constants, this isn't the pattern we're looking for.
3140 if (!CFVal || !CTVal)
3141 return Op;
3142
3143 // We can commute the SELECT_CC by inverting the condition. This
3144 // might be needed to make this fit into a CSINV pattern.
3145 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
3146 std::swap(TVal, FVal);
3147 std::swap(CTVal, CFVal);
3148 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3149 }
3150
3151 // If the constants line up, perform the transform!
3152 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
3153 SDValue CCVal;
3154 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3155
3156 FVal = Other;
3157 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3158 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3159
3160 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3161 CCVal, Cmp);
3162 }
3163
3164 return Op;
3165}
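// For illustration (assumed, not part of the original source): with
//   %r = xor i64 %x, (select_cc %a, %b, eq, 0, -1)
// the fold above yields a compare followed by a single conditional invert,
// e.g. "cmp x0, x1; csinv x2, x2, x2, <cond>", instead of materialising the
// all-ones mask and using a separate eor.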
3166
3167 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
3168 EVT VT = Op.getValueType();
3169
3170 // Let legalize expand this if it isn't a legal type yet.
3171 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3172 return SDValue();
3173
3174 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
3175
3176 unsigned Opc;
3177 bool ExtraOp = false;
3178 switch (Op.getOpcode()) {
3179 default:
3180 llvm_unreachable("Invalid code");
3181 case ISD::ADDC:
3182 Opc = AArch64ISD::ADDS;
3183 break;
3184 case ISD::SUBC:
3185 Opc = AArch64ISD::SUBS;
3186 break;
3187 case ISD::ADDE:
3188 Opc = AArch64ISD::ADCS;
3189 ExtraOp = true;
3190 break;
3191 case ISD::SUBE:
3192 Opc = AArch64ISD::SBCS;
3193 ExtraOp = true;
3194 break;
3195 }
3196
3197 if (!ExtraOp)
3198 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
3199 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
3200 Op.getOperand(2));
3201}
3202
3203 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3204 // Let legalize expand this if it isn't a legal type yet.
3205 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3206 return SDValue();
3207
3208 SDLoc dl(Op);
3209 AArch64CC::CondCode CC;
3210 // The actual operation that sets the overflow or carry flag.
3211 SDValue Value, Overflow;
3212 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3213
3214 // We use 0 and 1 as false and true values.
3215 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3216 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3217
3218 // We use an inverted condition, because the conditional select is inverted
3219 // too. This will allow it to be selected to a single instruction:
3220 // CSINC Wd, WZR, WZR, invert(cond).
3221 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3222 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3223 CCVal, Overflow);
3224
3225 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3226 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3227}
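// Illustrative use of the lowering above (assumed typical output): an
// overflow flag consumed as a boolean, e.g.
//   %p = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
//   %o = extractvalue {i32, i1} %p, 1
// usually selects to "adds w0, w0, w1" followed by "cset w1, hs", i.e. the
// CSEL above collapses into a single CSET with the inverted condition.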
3228
3229// Prefetch operands are:
3230// 1: Address to prefetch
3231// 2: bool isWrite
3232// 3: int locality (0 = no locality ... 3 = extreme locality)
3233// 4: bool isDataCache
3234 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3235 SDLoc DL(Op);
3236 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3237 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3238 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3239
3240 bool IsStream = !Locality;
3241 // When the locality number is set
3242 if (Locality) {
3243 // The front-end should have filtered out the out-of-range values
3244 assert(Locality <= 3 && "Prefetch locality out-of-range");
3245 // The locality degree is the inverse of the cache level: higher locality
3246 // maps to a closer (lower-numbered) cache, and the encoding starts at 0
3247 // for level 1, so flip the number around.
3248 Locality = 3 - Locality;
3249 }
3250
3251 // Build the mask value encoding the expected behavior.
3252 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
3253 (!IsData << 3) | // IsDataCache bit
3254 (Locality << 1) | // Cache level bits
3255 (unsigned)IsStream; // Stream bit
3256 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3257 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
3258}
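// Worked example of the PrfOp encoding above (illustrative only): for
// __builtin_prefetch(p, /*rw=*/1, /*locality=*/3) the operands are
// IsWrite=1, Locality=3, IsData=1, so Locality becomes 3-3=0, IsStream=0 and
// PrfOp = (1<<4) | (0<<3) | (0<<1) | 0 = 0b10000, which corresponds to the
// PSTL1KEEP prefetch operation ("prfm pstl1keep, [x0]").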
3259
3260SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3261 SelectionDAG &DAG) const {
3262 EVT VT = Op.getValueType();
3263 if (VT.isScalableVector())
3264 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3265
3266 if (useSVEForFixedLengthVectorVT(VT))
3267 return LowerFixedLengthFPExtendToSVE(Op, DAG);
3268
3269 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3270 return SDValue();
3271}
3272
3273SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3274 SelectionDAG &DAG) const {
3275 if (Op.getValueType().isScalableVector())
3276 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3277
3278 bool IsStrict = Op->isStrictFPOpcode();
3279 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3280 EVT SrcVT = SrcVal.getValueType();
3281
3282 if (useSVEForFixedLengthVectorVT(SrcVT))
3283 return LowerFixedLengthFPRoundToSVE(Op, DAG);
3284
3285 if (SrcVT != MVT::f128) {
3286 // Expand cases where the input is a vector bigger than NEON.
3287 if (useSVEForFixedLengthVectorVT(SrcVT))
3288 return SDValue();
3289
3290 // It's legal except when f128 is involved
3291 return Op;
3292 }
3293
3294 return SDValue();
3295}
3296
3297SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3298 SelectionDAG &DAG) const {
3299 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3300 // Any additional optimization in this function should be recorded
3301 // in the cost tables.
3302 EVT InVT = Op.getOperand(0).getValueType();
3303 EVT VT = Op.getValueType();
3304
3305 if (VT.isScalableVector()) {
3306 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3309 return LowerToPredicatedOp(Op, DAG, Opcode);
3310 }
3311
3312 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3313 return LowerFixedLengthFPToIntToSVE(Op, DAG);
3314
3315 unsigned NumElts = InVT.getVectorNumElements();
3316
3317 // f16 conversions are promoted to f32 when full fp16 is not supported.
3318 if (InVT.getVectorElementType() == MVT::f16 &&
3319 !Subtarget->hasFullFP16()) {
3320 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3321 SDLoc dl(Op);
3322 return DAG.getNode(
3323 Op.getOpcode(), dl, Op.getValueType(),
3324 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3325 }
3326
3327 uint64_t VTSize = VT.getFixedSizeInBits();
3328 uint64_t InVTSize = InVT.getFixedSizeInBits();
3329 if (VTSize < InVTSize) {
3330 SDLoc dl(Op);
3331 SDValue Cv =
3332 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3333 Op.getOperand(0));
3334 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3335 }
3336
3337 if (VTSize > InVTSize) {
3338 SDLoc dl(Op);
3339 MVT ExtVT =
3340 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3341 VT.getVectorNumElements());
3342 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3343 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3344 }
3345
3346 // Type changing conversions are illegal.
3347 return Op;
3348}
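// Illustrative example (assumed typical output) of the narrowing path above:
// an fptosi from v4f32 to v4i16 has VTSize (64) < InVTSize (128), so it is
// first converted as v4f32 -> v4i32 ("fcvtzs v0.4s, v0.4s") and then
// truncated to v4i16 ("xtn v0.4h, v0.4s").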
3349
3350SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3351 SelectionDAG &DAG) const {
3352 bool IsStrict = Op->isStrictFPOpcode();
3353 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3354
3355 if (SrcVal.getValueType().isVector())
3356 return LowerVectorFP_TO_INT(Op, DAG);
3357
3358 // f16 conversions are promoted to f32 when full fp16 is not supported.
3359 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3360 assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3361 SDLoc dl(Op);
3362 return DAG.getNode(
3363 Op.getOpcode(), dl, Op.getValueType(),
3365 }
3366
3367 if (SrcVal.getValueType() != MVT::f128) {
3368 // It's legal except when f128 is involved
3369 return Op;
3370 }
3371
3372 return SDValue();
3373}
3374
3375SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
3376 SelectionDAG &DAG) const {
3377 // AArch64 FP-to-int conversions saturate to the destination register size, so
3378 // we can lower common saturating conversions to simple instructions.
3379 SDValue SrcVal = Op.getOperand(0);
3380
3381 EVT SrcVT = SrcVal.getValueType();
3382 EVT DstVT = Op.getValueType();
3383
3384 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3385 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3386 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3387 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3388
3389 // TODO: Support lowering of NEON and SVE conversions.
3390 if (SrcVT.isVector())
3391 return SDValue();
3392
3393 // TODO: Saturate to SatWidth explicitly.
3394 if (SatWidth != DstWidth)
3395 return SDValue();
3396
3397 // In the absence of FP16 support, promote f16 to f32, like LowerFP_TO_INT().
3398 if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16())
3399 return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
3400 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal),
3401 Op.getOperand(1));
3402
3403 // Cases that we can emit directly.
3404 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
3405 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
3406 (DstVT == MVT::i64 || DstVT == MVT::i32))
3407 return Op;
3408
3409 // For all other cases, fall back on the expanded form.
3410 return SDValue();
3411}
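// Illustrative example (assumed): a saturating conversion such as
//   %r = call i32 @llvm.fptosi.sat.i32.f32(float %x)
// falls into the "emit directly" case above and becomes a single
// "fcvtzs w0, s0", because AArch64 FCVTZS/FCVTZU already saturate to the
// destination register width.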
3412
3413SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
3414 SelectionDAG &DAG) const {
3415 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3416 // Any additional optimization in this function should be recorded
3417 // in the cost tables.
3418 EVT VT = Op.getValueType();
3419 SDLoc dl(Op);
3420 SDValue In = Op.getOperand(0);
3421 EVT InVT = In.getValueType();
3422 unsigned Opc = Op.getOpcode();
3423 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
3424
3425 if (VT.isScalableVector()) {
3426 if (InVT.getVectorElementType() == MVT::i1) {
3427 // We can't directly extend an SVE predicate; extend it first.
3428 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3430 In = DAG.getNode(CastOpc, dl, CastVT, In);
3431 return DAG.getNode(Opc, dl, VT, In);
3432 }
3433
3434 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
3436 return LowerToPredicatedOp(Op, DAG, Opcode);
3437 }
3438
3439 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3440 return LowerFixedLengthIntToFPToSVE(Op, DAG);
3441
3442 uint64_t VTSize = VT.getFixedSizeInBits();
3443 uint64_t InVTSize = InVT.getFixedSizeInBits();
3444 if (VTSize < InVTSize) {
3445 MVT CastVT =
3446 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
3447 InVT.getVectorNumElements());
3448 In = DAG.getNode(Opc, dl, CastVT, In);
3449 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
3450 }
3451
3452 if (VTSize > InVTSize) {
3453 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3455 In = DAG.getNode(CastOpc, dl, CastVT, In);
3456 return DAG.getNode(Opc, dl, VT, In);
3457 }
3458
3459 return Op;
3460}
3461
3462SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
3463 SelectionDAG &DAG) const {
3464 if (Op.getValueType().isVector())
3465 return LowerVectorINT_TO_FP(Op, DAG);
3466
3467 bool IsStrict = Op->isStrictFPOpcode();
3468 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3469
3470 // f16 conversions are promoted to f32 when full fp16 is not supported.
3471 if (Op.getValueType() == MVT::f16 &&
3472 !Subtarget->hasFullFP16()) {
3473 assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3474 SDLoc dl(Op);
3475 return DAG.getNode(
3477 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
3478 DAG.getIntPtrConstant(0, dl));
3479 }
3480
3481 // i128 conversions are libcalls.
3482 if (SrcVal.getValueType() == MVT::i128)
3483 return SDValue();
3484
3485 // Other conversions are legal, unless it's to the completely software-based
3486 // fp128.
3487 if (Op.getValueType() != MVT::f128)
3488 return Op;
3489 return SDValue();
3490}
3491
3492SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
3493 SelectionDAG &DAG) const {
3494 // For iOS, we want to call an alternative entry point: __sincos_stret,
3495 // which returns the values in two S / D registers.
3496 SDLoc dl(Op);
3497 SDValue Arg = Op.getOperand(0);
3498 EVT ArgVT = Arg.getValueType();
3499 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
3500
3501 ArgListTy Args;
3502 ArgListEntry Entry;
3503
3504 Entry.Node = Arg;
3505 Entry.Ty = ArgTy;
3506 Entry.IsSExt = false;
3507 Entry.IsZExt = false;
3508 Args.push_back(Entry);
3509
3510 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
3511 : RTLIB::SINCOS_STRET_F32;
3512 const char *LibcallName = getLibcallName(LC);
3513 SDValue Callee =
3515
3518 CLI.setDebugLoc(dl)
3519 .setChain(DAG.getEntryNode())
3520 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
3521
3522 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3523 return CallResult.first;
3524}
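// Illustrative note (assumed ABI detail): on targets providing it,
// __sincos_stret returns {sin, cos} in the first two FP result registers
// (s0/s1 for float, d0/d1 for double), so a call like
//   %sc = call { double, double } @__sincos_stret(double %x)
// avoids issuing separate sin() and cos() libcalls on the same argument.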
3525
3527
3528SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
3529 SelectionDAG &DAG) const {
3530 EVT OpVT = Op.getValueType();
3531 EVT ArgVT = Op.getOperand(0).getValueType();
3532
3533 if (useSVEForFixedLengthVectorVT(OpVT))
3534 return LowerFixedLengthBitcastToSVE(Op, DAG);
3535
3536 if (OpVT.isScalableVector()) {
3537 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
3538 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
3539 "Expected int->fp bitcast!");
3542 Op.getOperand(0));
3543 return getSVESafeBitCast(OpVT, ExtResult, DAG);
3544 }
3545 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
3546 }
3547
3548 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
3549 return SDValue();
3550
3551 assert(ArgVT == MVT::i16);
3552 SDLoc DL(Op);
3553
3554 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
3555 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
3556 return SDValue(
3557 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
3558 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
3559 0);
3560}
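// Illustrative example (assumed typical output) of the scalar path above:
//   %f = bitcast i16 %x to half
// is lowered by extending %x to i32, bitcasting to f32 ("fmov s0, w0") and
// then extracting the hsub subregister, so no memory round-trip is needed.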
3561
3562 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
3563 if (OrigVT.getSizeInBits() >= 64)
3564 return OrigVT;
3565
3566 assert(OrigVT.isSimple() && "Expecting a simple value type");
3567
3568 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
3569 switch (OrigSimpleTy) {
3570 default: llvm_unreachable("Unexpected Vector Type");
3571 case MVT::v2i8:
3572 case MVT::v2i16:
3573 return MVT::v2i32;
3574 case MVT::v4i8:
3575 return MVT::v4i16;
3576 }
3577}
3578
3579 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
3580 const EVT &OrigTy,
3581 const EVT &ExtTy,
3582 unsigned ExtOpcode) {
3583 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
3584 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
3585 // 64-bits we need to insert a new extension so that it will be 64-bits.
3586 assert(ExtTy.is128BitVector() && "Unexpected extension size");
3587 if (OrigTy.getSizeInBits() >= 64)
3588 return N;
3589
3590 // Must extend size to at least 64 bits to be used as an operand for VMULL.
3591 EVT NewVT = getExtensionTo64Bits(OrigTy);
3592
3593 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
3594}
3595
3596 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
3597 bool isSigned) {
3598 EVT VT = N->getValueType(0);
3599
3600 if (N->getOpcode() != ISD::BUILD_VECTOR)
3601 return false;
3602
3603 for (const SDValue &Elt : N->op_values()) {
3604 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
3605 unsigned EltSize = VT.getScalarSizeInBits();
3606 unsigned HalfSize = EltSize / 2;
3607 if (isSigned) {
3608 if (!isIntN(HalfSize, C->getSExtValue()))
3609 return false;
3610 } else {
3611 if (!isUIntN(HalfSize, C->getZExtValue()))
3612 return false;
3613 }
3614 continue;
3615 }
3616 return false;
3617 }
3618
3619 return true;
3620}
3621
3622 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
3623 if (N->getOpcode() == ISD::SIGN_EXTEND ||
3624 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
3625 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
3626 N->getOperand(0)->getValueType(0),
3627 N->getValueType(0),
3628 N->getOpcode());
3629
3630 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
3631 EVT VT = N->getValueType(0);
3632 SDLoc dl(N);
3633 unsigned EltSize = VT.getScalarSizeInBits() / 2;
3634 unsigned NumElts = VT.getVectorNumElements();
3635 MVT TruncVT = MVT::getIntegerVT(EltSize);
3636 SmallVector<SDValue, 8> Ops;
3637 for (unsigned i = 0; i != NumElts; ++i) {
3638 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
3639 const APInt &CInt = C->getAPIntValue();
3640 // Element types smaller than 32 bits are not legal, so use i32 elements.
3641 // The values are implicitly truncated so sext vs. zext doesn't matter.
3642 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
3643 }
3644 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
3645}
3646
3647 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
3648 return N->getOpcode() == ISD::SIGN_EXTEND ||
3649 N->getOpcode() == ISD::ANY_EXTEND ||
3650 isExtendedBUILD_VECTOR(N, DAG, true);
3651}
3652
3653 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
3654 return N->getOpcode() == ISD::ZERO_EXTEND ||
3655 N->getOpcode() == ISD::ANY_EXTEND ||
3656 isExtendedBUILD_VECTOR(N, DAG, false);
3657}
3658
3659static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
3660 unsigned Opcode = N->getOpcode();
3661 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3662 SDNode *N0 = N->getOperand(0).getNode();
3663 SDNode *N1 = N->getOperand(1).getNode();
3664 return N0->hasOneUse() && N1->hasOneUse() &&
3665 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
3666 }
3667 return false;
3668}
3669
3670static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
3671 unsigned Opcode = N->getOpcode();
3672 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3673 SDNode *N0 = N->getOperand(0).getNode();
3674 SDNode *N1 = N->getOperand(1).getNode();
3675 return N0->hasOneUse() && N1->hasOneUse() &&
3676 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
3677 }
3678 return false;
3679}
3680
3681SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3682 SelectionDAG &DAG) const {
3683 // The rounding mode is in bits 23:22 of the FPCR.
3684 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
3685 // The formula we use to implement this is (((FPCR + 1 << 22) >> 22) & 3)
3686 // so that the shift + and get folded into a bitfield extract.
3687 SDLoc dl(Op);
3688
3689 SDValue Chain = Op.getOperand(0);
3690 SDValue FPCR_64 = DAG.getNode(
3692 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
3693 Chain = FPCR_64.getValue(1);
3696 DAG.getConstant(1U << 22, dl, MVT::i32));
3698 DAG.getConstant(22, dl, MVT::i32));
3700 DAG.getConstant(3, dl, MVT::i32));
3701 return DAG.getMergeValues({AND, Chain}, dl);
3702}
3703
3704SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
3705 SelectionDAG &DAG) const {
3706 SDLoc DL(Op);
3707 SDValue Chain = Op->getOperand(0);
3708 SDValue RMValue = Op->getOperand(1);
3709
3710 // The rounding mode is in bits 23:22 of the FPCR.
3711 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
3712 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
3713 // ((arg - 1) & 3) << 22).
3714 //
3715 // The argument of llvm.set.rounding must be within the segment [0, 3], so
3716 // NearestTiesToAway (4) is not handled here. It is the responsibility of
3717 // the code that generated llvm.set.rounding to ensure this condition.
3718
3719 // Calculate new value of FPCR[23:22].
3721 DAG.getConstant(1, DL, MVT::i32));
3723 DAG.getConstant(0x3, DL, MVT::i32));
3724 RMValue =
3728
3729 // Get current value of FPCR.
3730 SDValue Ops[] = {
3731 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
3732 SDValue FPCR =
3734 Chain = FPCR.getValue(1);
3735 FPCR = FPCR.getValue(0);
3736
3737 // Put the new rounding mode into FPCR[23:22].
3740 DAG.getConstant(RMMask, DL, MVT::i64));
3742 SDValue Ops2[] = {
3743 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
3744 FPCR};
3746}
3747
3748SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
3749 EVT VT = Op.getValueType();
3750
3751 // If SVE is available then i64 vector multiplications can also be made legal.
3752 bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
3753
3754 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
3755 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
3756
3757 // Multiplications are only custom-lowered for 128-bit vectors so that
3758 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
3759 assert(VT.is128BitVector() && VT.isInteger() &&
3760 "unexpected type for custom-lowering ISD::MUL");
3761 SDNode *N0 = Op.getOperand(0).getNode();
3762 SDNode *N1 = Op.getOperand(1).getNode();
3763 unsigned NewOpc = 0;
3764 bool isMLA = false;
3765 bool isN0SExt = isSignExtended(N0, DAG);
3766 bool isN1SExt = isSignExtended(N1, DAG);
3767 if (isN0SExt && isN1SExt)
3768 NewOpc = AArch64ISD::SMULL;
3769 else {
3770 bool isN0ZExt = isZeroExtended(N0, DAG);
3771 bool isN1ZExt = isZeroExtended(N1, DAG);
3772 if (isN0ZExt && isN1ZExt)
3773 NewOpc = AArch64ISD::UMULL;
3774 else if (isN1SExt || isN1ZExt) {
3775 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
3776 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
3777 if (isN1SExt && isAddSubSExt(N0, DAG)) {
3778 NewOpc = AArch64ISD::SMULL;
3779 isMLA = true;
3780 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
3781 NewOpc = AArch64ISD::UMULL;
3782 isMLA = true;
3783 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
3784 std::swap(N0, N1);
3785 NewOpc = AArch64ISD::UMULL;
3786 isMLA = true;
3787 }
3788 }
3789
3790 if (!NewOpc) {
3791 if (VT == MVT::v2i64)
3792 // Fall through to expand this. It is not legal.
3793 return SDValue();
3794 else
3795 // Other vector multiplications are legal.
3796 return Op;
3797 }
3798 }
3799
3800 // Legalize to a S/UMULL instruction
3801 SDLoc DL(Op);
3802 SDValue Op0;
3803 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
3804 if (!isMLA) {
3805 Op0 = skipExtensionForVectorMULL(N0, DAG);
3806 assert(Op0.getValueType().is64BitVector() &&
3807 Op1.getValueType().is64BitVector() &&
3808 "unexpected types for extended operands to VMULL");
3809 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
3810 }
3811 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
3812 // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
3813 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
3814 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
3815 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
3816 EVT Op1VT = Op1.getValueType();
3817 return DAG.getNode(N0->getOpcode(), DL, VT,
3818 DAG.getNode(NewOpc, DL, VT,
3819 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
3820 DAG.getNode(NewOpc, DL, VT,
3821 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
3822}
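// Illustrative example (assumed typical output) of the VMULL path above:
//   %m = mul <8 x i16> (sext <8 x i8> %a to <8 x i16>),
//                      (sext <8 x i8> %b to <8 x i16>)
// matches the isN0SExt && isN1SExt case and selects to a single
// "smull v0.8h, v0.8b, v1.8b" instead of two extends and a 16-bit multiply.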
3823
3824static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
3825 int Pattern) {
3826 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
3828}
3829
3830 static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) {
3831 SDLoc DL(Op);
3832 EVT OutVT = Op.getValueType();
3833 SDValue InOp = Op.getOperand(1);
3834 EVT InVT = InOp.getValueType();
3835
3836 // Return the operand if the cast isn't changing type,
3837 // i.e. <n x 16 x i1> -> <n x 16 x i1>
3838 if (InVT == OutVT)
3839 return InOp;
3840
3841 SDValue Reinterpret =
3842 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, InOp);
3843
3844 // If the argument converted to an svbool is a ptrue or a comparison, the
3845 // lanes introduced by the widening are zero by construction.
3846 switch (InOp.getOpcode()) {
3847 case AArch64ISD::SETCC_MERGE_ZERO:
3848 return Reinterpret;
3849 case ISD::INTRINSIC_WO_CHAIN:
3850 if (InOp.getConstantOperandVal(0) == Intrinsic::aarch64_sve_ptrue)
3851 return Reinterpret;
3852 }
3853
3854 // Otherwise, zero the newly introduced lanes.
3855 SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all);
3856 SDValue MaskReinterpret =
3857 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, Mask);
3858 return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret);
3859}
3860
3861SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
3862 SelectionDAG &DAG) const {
3863 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3864 SDLoc dl(Op);
3865 switch (IntNo) {
3866 default: return SDValue(); // Don't custom lower most intrinsics.
3867 case Intrinsic::thread_pointer: {
3869 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
3870 }
3871 case Intrinsic::aarch64_neon_abs: {
3872 EVT Ty = Op.getValueType();
3873 if (Ty == MVT::i64) {
3875 Op.getOperand(1));
3876 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
3877 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
3878 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
3879 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
3880 } else {
3881 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
3882 }
3883 }
3884 case Intrinsic::aarch64_neon_smax:
3885 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
3886 Op.getOperand(1), Op.getOperand(2));
3887 case Intrinsic::aarch64_neon_umax:
3888 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
3889 Op.getOperand(1), Op.getOperand(2));
3890 case Intrinsic::aarch64_neon_smin:
3891 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
3892 Op.getOperand(1), Op.getOperand(2));
3893 case Intrinsic::aarch64_neon_umin:
3894 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
3895 Op.getOperand(1), Op.getOperand(2));
3896
3897 case Intrinsic::aarch64_sve_sunpkhi:
3898 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
3899 Op.getOperand(1));
3900 case Intrinsic::aarch64_sve_sunpklo:
3901 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
3902 Op.getOperand(1));
3903 case Intrinsic::aarch64_sve_uunpkhi:
3904 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
3905 Op.getOperand(1));
3906 case Intrinsic::aarch64_sve_uunpklo:
3907 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
3908 Op.getOperand(1));
3909 case Intrinsic::aarch64_sve_clasta_n:
3910 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
3911 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3912 case Intrinsic::aarch64_sve_clastb_n:
3913 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
3914 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3915 case Intrinsic::aarch64_sve_lasta:
3916 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
3917 Op.getOperand(1), Op.getOperand(2));
3918 case Intrinsic::aarch64_sve_lastb:
3919 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
3920 Op.getOperand(1), Op.getOperand(2));
3921 case Intrinsic::aarch64_sve_rev:
3922 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
3923 Op.getOperand(1));
3924 case Intrinsic::aarch64_sve_tbl:
3925 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
3926 Op.getOperand(1), Op.getOperand(2));
3927 case Intrinsic::aarch64_sve_trn1:
3928 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
3929 Op.getOperand(1), Op.getOperand(2));
3930 case Intrinsic::aarch64_sve_trn2:
3931 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
3932 Op.getOperand(1), Op.getOperand(2));
3933 case Intrinsic::aarch64_sve_uzp1:
3934 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
3935 Op.getOperand(1), Op.getOperand(2));
3936 case Intrinsic::aarch64_sve_uzp2:
3937 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
3938 Op.getOperand(1), Op.getOperand(2));
3939 case Intrinsic::aarch64_sve_zip1:
3940 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
3941 Op.getOperand(1), Op.getOperand(2));
3942 case Intrinsic::aarch64_sve_zip2:
3943 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
3944 Op.getOperand(1), Op.getOperand(2));
3945 case Intrinsic::aarch64_sve_splice:
3946 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
3947 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3948 case Intrinsic::aarch64_sve_ptrue:
3949 return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
3950 Op.getOperand(1));
3951 case Intrinsic::aarch64_sve_clz:
3952 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
3953 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3954 case Intrinsic::aarch64_sve_cnt: {
3955 SDValue Data = Op.getOperand(3);
3956 // CTPOP only supports integer operands.
3957 if (Data.getValueType().isFloatingPoint())
3958 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
3959 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
3960 Op.getOperand(2), Data, Op.getOperand(1));
3961 }
3962 case Intrinsic::aarch64_sve_dupq_lane:
3963 return LowerDUPQLane(Op, DAG);
3964 case Intrinsic::aarch64_sve_convert_from_svbool:
3965 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
3966 Op.getOperand(1));
3967 case Intrinsic::aarch64_sve_convert_to_svbool:
3968 return lowerConvertToSVBool(Op, DAG);
3969 case Intrinsic::aarch64_sve_fneg:
3970 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
3971 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3972 case Intrinsic::aarch64_sve_frintp:
3973 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
3974 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3975 case Intrinsic::aarch64_sve_frintm:
3976 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
3977 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3978 case Intrinsic::aarch64_sve_frinti:
3979 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3980 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3981 case Intrinsic::aarch64_sve_frintx:
3982 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3983 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3984 case Intrinsic::aarch64_sve_frinta:
3985 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
3986 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3987 case Intrinsic::aarch64_sve_frintn:
3988 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
3989 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3990 case Intrinsic::aarch64_sve_frintz:
3991 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
3992 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3993 case Intrinsic::aarch64_sve_ucvtf:
3995 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3996 Op.getOperand(1));
3997 case Intrinsic::aarch64_sve_scvtf:
3999 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4000 Op.getOperand(1));
4001 case Intrinsic::aarch64_sve_fcvtzu:
4003 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4004 Op.getOperand(1));
4005 case Intrinsic::aarch64_sve_fcvtzs:
4007 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4008 Op.getOperand(1));
4009 case Intrinsic::aarch64_sve_fsqrt:
4010 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
4011 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4012 case Intrinsic::aarch64_sve_frecpx:
4013 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
4014 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4015 case Intrinsic::aarch64_sve_fabs:
4016 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4017 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4018 case Intrinsic::aarch64_sve_abs:
4019 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4020 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4021 case Intrinsic::aarch64_sve_neg:
4022 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
4023 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4024 case Intrinsic::aarch64_sve_insr: {
4025 SDValue Scalar = Op.getOperand(2);
4026 EVT ScalarTy = Scalar.getValueType();
4027 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
4028 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
4029
4030 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
4031 Op.getOperand(1), Scalar);
4032 }
4033 case Intrinsic::aarch64_sve_rbit:
4035 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4036 Op.getOperand(1));
4037 case Intrinsic::aarch64_sve_revb:
4038 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
4039 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4040 case Intrinsic::aarch64_sve_sxtb:
4041 return DAG.getNode(
4043 Op.getOperand(2), Op.getOperand(3),
4044 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4045 Op.getOperand(1));
4046 case Intrinsic::aarch64_sve_sxth:
4047 return DAG.getNode(
4049 Op.getOperand(2), Op.getOperand(3),
4050 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4051 Op.getOperand(1));
4052 case Intrinsic::aarch64_sve_sxtw:
4053 return DAG.getNode(
4055 Op.getOperand(2), Op.getOperand(3),
4056 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4057 Op.getOperand(1));
4058 case Intrinsic::aarch64_sve_uxtb:
4059 return DAG.getNode(
4061 Op.getOperand(2), Op.getOperand(3),
4062 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4063 Op.getOperand(1));
4064 case Intrinsic::aarch64_sve_uxth:
4065 return DAG.getNode(
4067 Op.getOperand(2), Op.getOperand(3),
4068 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4069 Op.getOperand(1));
4070 case Intrinsic::aarch64_sve_uxtw:
4071 return DAG.getNode(
4073 Op.getOperand(2), Op.getOperand(3),
4074 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4075 Op.getOperand(1));
4076
4077 case Intrinsic::localaddress: {
4078 const auto &MF = DAG.getMachineFunction();
4079 const auto *RegInfo = Subtarget->getRegisterInfo();
4080 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
4081 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
4082 Op.getSimpleValueType());
4083 }
4084
4085 case Intrinsic::eh_recoverfp: {
4086 // FIXME: This needs to be implemented to correctly handle highly aligned
4087 // stack objects. For now we simply return the incoming FP. Refer D53541
4088 // for more details.
4089 SDValue FnOp = Op.getOperand(1);
4090 SDValue IncomingFPOp = Op.getOperand(2);
4092 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
4093 if (!Fn)
4095 "llvm.eh.recoverfp must take a function as the first argument");
4096 return IncomingFPOp;
4097 }
4098
4099 case Intrinsic::aarch64_neon_vsri:
4100 case Intrinsic::aarch64_neon_vsli: {
4101 EVT Ty = Op.getValueType();
4102
4103 if (!Ty.isVector())
4104 report_fatal_error("Unexpected type for aarch64_neon_vsli");
4105
4106 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
4107
4108 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
4109 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
4110 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
4111 Op.getOperand(3));
4112 }
4113
4114 case Intrinsic::aarch64_neon_srhadd:
4115 case Intrinsic::aarch64_neon_urhadd:
4116 case Intrinsic::aarch64_neon_shadd:
4117 case Intrinsic::aarch64_neon_uhadd: {
4118 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4119 IntNo == Intrinsic::aarch64_neon_shadd);
4120 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4121 IntNo == Intrinsic::aarch64_neon_urhadd);
4122 unsigned Opcode =
4123 IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
4124 : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD);
4125 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4126 Op.getOperand(2));
4127 }
4128 case Intrinsic::aarch64_neon_sabd:
4129 case Intrinsic::aarch64_neon_uabd: {
4130 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
4131 : ISD::ABDS;
4132 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4133 Op.getOperand(2));
4134 }
4135 case Intrinsic::aarch64_neon_uaddlp: {
4136 unsigned Opcode = AArch64ISD::UADDLP;
4137 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
4138 }
4139 case Intrinsic::aarch64_neon_sdot:
4140 case Intrinsic::aarch64_neon_udot:
4141 case Intrinsic::aarch64_sve_sdot:
4142 case Intrinsic::aarch64_sve_udot: {
4143 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
4144 IntNo == Intrinsic::aarch64_sve_udot)
4145 ? AArch64ISD::UDOT
4146 : AArch64ISD::SDOT;
4147 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4148 Op.getOperand(2), Op.getOperand(3));
4149 }
4150 }
4151}
4152
4153bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
4154 if (VT.getVectorElementType() == MVT::i8 ||
4156 EltTy = MVT::i32;
4157 return true;
4158 }
4159 return false;
4160}
4161
4162bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
4163 if (VT.getVectorElementType() == MVT::i32 &&
4165 !VT.isFixedLengthVector())
4166 return true;
4167
4168 return false;
4169}
4170
4171bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
4172 return ExtVal.getValueType().isScalableVector();
4173}
4174
4175unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
4176 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
4177 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
4179 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
4181 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
4183 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
4185 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
4187 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
4189 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
4191 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
4193 };
4194 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
4195 return AddrModes.find(Key)->second;
4196}
4197
4198unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
4199 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
4200 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
4202 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
4204 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
4206 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
4208 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
4210 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
4212 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
4214 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
4216 };
4217 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
4218 return AddrModes.find(Key)->second;
4219}
4220
4242
4244 unsigned Opcode = Index.getOpcode();
4245 if (Opcode == ISD::SIGN_EXTEND_INREG)
4246 return true;
4247
4248 if (Opcode == ISD::AND) {
4249 SDValue Splat = Index.getOperand(1);
4250 if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
4251 return false;
4253 if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
4254 return false;
4255 return true;
4256 }
4257
4258 return false;
4259}
4260
4261// If the base pointer of a masked gather or scatter is null, we
4262// may be able to swap BasePtr & Index and use the vector + register
4263// or vector + immediate addressing mode, e.g.
4264// VECTOR + REGISTER:
4265// getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices)
4266// -> getelementptr %offset, <vscale x N x T> %indices
4267// VECTOR + IMMEDIATE:
4268// getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices)
4269// -> getelementptr #x, <vscale x N x T> %indices
4270 static void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT,
4271 unsigned &Opcode, bool IsGather,
4272 SelectionDAG &DAG) {
4273 if (!isNullConstant(BasePtr))
4274 return;
4275
4276 // FIXME: This will not match for fixed vector type codegen as the nodes in
4277 // question will have fixed<->scalable conversions around them. This should be
4278 // moved to a DAG combine or complex pattern so that it executes after all of
4279 // the fixed vector inserts and extracts have been removed. This deficiency
4280 // will result in a sub-optimal addressing mode being used, i.e. an ADD not
4281 // being folded into the scatter/gather.
4282 ConstantSDNode *Offset = nullptr;
4283 if (Index.getOpcode() == ISD::ADD)
4284 if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
4287 else {
4288 BasePtr = SplatVal;
4289 Index = Index->getOperand(0);
4290 return;
4291 }
4292 }
4293
4294 unsigned NewOp =
4296
4297 if (!Offset) {
4298 std::swap(BasePtr, Index);
4299 Opcode = NewOp;
4300 return;
4301 }
4302
4303 uint64_t OffsetVal = Offset->getZExtValue();
4304 unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8;
4305 auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64);
4306
4308 // Index is out of range for the immediate addressing mode
4309 BasePtr = ConstOffset;
4310 Index = Index->getOperand(0);
4311 return;
4312 }
4313
4314 // Immediate is in range
4315 Opcode = NewOp;
4316 BasePtr = Index->getOperand(0);
4317 Index = ConstOffset;
4318}
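// Illustrative example (assumed) of the swap performed above: a gather of
//   getelementptr nullptr, <vscale x 2 x i64> (splat(%base) + %offsets)
// has a null BasePtr, so %base becomes the scalar base register and
// %offsets the vector index, allowing the "[x0, z0.d]" style addressing mode
// instead of an extra vector ADD feeding the gather.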
4319
4320SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
4321 SelectionDAG &DAG) const {
4322 SDLoc DL(Op);
4324 assert(MGT && "Can only custom lower gather load nodes");
4325
4326 bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector();
4327
4328 SDValue Index = MGT->getIndex();
4329 SDValue Chain = MGT->getChain();
4330 SDValue PassThru = MGT->getPassThru();
4331 SDValue Mask = MGT->getMask();
4332 SDValue BasePtr = MGT->getBasePtr();
4333 ISD::LoadExtType ExtTy = MGT->getExtensionType();
4334
4335 ISD::MemIndexType IndexType = MGT->getIndexType();
4336 bool IsScaled =
4337 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
4338 bool IsSigned =
4339 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
4340 bool IdxNeedsExtend =
4342 Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4343 bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
4344
4345 EVT VT = PassThru.getSimpleValueType();
4346 EVT IndexVT = Index.getSimpleValueType();
4347 EVT MemVT = MGT->getMemoryVT();
4349
4350 if (VT.getVectorElementType() == MVT::bf16 &&
4351 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4352 return SDValue();
4353
4354 if (IsFixedLength) {
4355 assert(Subtarget->useSVEForFixedLengthVectors() &&
4356 "Cannot lower when not using SVE for fixed vectors");
4357 if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
4359 MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
4360 } else {
4362 IndexVT = MemVT.changeTypeToInteger();
4363 }
4364 InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
4365 Mask = DAG.getNode(
4367 VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
4368 }
4369
4370 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
4371 PassThru = SDValue();
4372
4373 if (VT.isFloatingPoint() && !IsFixedLength) {
4374 // Handle FP data by using an integer gather and casting the result.
4375 if (PassThru) {
4377 PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
4378 }
4379 InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4380 }
4381
4383
4385 Index = Index.getOperand(0);
4386
4387 unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
4388 selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4389 /*isGather=*/true, DAG);
4390
4392 Opcode = getSignExtendedGatherOpcode(Opcode);
4393
4394 if (IsFixedLength) {
4395 if (Index.getSimpleValueType().isFixedLengthVector())
4396 Index = convertToScalableVector(DAG, IndexVT, Index);
4397 if (BasePtr.getSimpleValueType().isFixedLengthVector())
4398 BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
4400 }
4401
4402 SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT};
4403 SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops);
4404 Chain = Result.getValue(1);
4405
4406 if (IsFixedLength) {
4408 DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()),
4409 Result);
4410 Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result);
4411 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
4412
4413 if (PassThru)
4414 Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru);
4415 } else {
4416 if (PassThru)
4417 Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru);
4418
4419 if (VT.isFloatingPoint())
4420 Result = getSVESafeBitCast(VT, Result, DAG);
4421 }
4422
4423 return DAG.getMergeValues({Result, Chain}, DL);
4424}
4425
4426SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
4427 SelectionDAG &DAG) const {
4428 SDLoc DL(Op);
4430 assert(MSC && "Can only custom lower scatter store nodes");
4431
4432 bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector();
4433
4434 SDValue Index = MSC->getIndex();
4435 SDValue Chain = MSC->getChain();
4436 SDValue StoreVal = MSC->getValue();
4437 SDValue Mask = MSC->getMask();
4438 SDValue BasePtr = MSC->getBasePtr();
4439
4440 ISD::MemIndexType IndexType = MSC->getIndexType();
4441 bool IsScaled =
4442 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
4443 bool IsSigned =
4444 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
4445 bool NeedsExtend =
4447 Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4448
4449 EVT VT = StoreVal.getSimpleValueType();
4450 EVT IndexVT = Index.getSimpleValueType();
4451 SDVTList VTs = DAG.getVTList(MVT::Other);
4452 EVT MemVT = MSC->getMemoryVT();
4454
4455 if (VT.getVectorElementType() == MVT::bf16 &&
4456 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4457 return SDValue();
4458
4459 if (IsFixedLength) {
4460 assert(Subtarget->useSVEForFixedLengthVectors() &&
4461 "Cannot lower when not using SVE for fixed vectors");
4462 if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
4464 MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
4465 } else {
4467 IndexVT = MemVT.changeTypeToInteger();
4468 }
4469 InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
4470
4471 StoreVal =
4473 StoreVal = DAG.getNode(
4475 VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
4477 Mask = DAG.getNode(
4479 VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
4480 } else if (VT.isFloatingPoint()) {
4481 // Handle FP data by casting the data so an integer scatter can be used.
4483 StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
4484 InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4485 }
4486
4488 Index = Index.getOperand(0);
4489
4490 unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend);
4491 selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4492 /*isGather=*/false, DAG);
4493
4494 if (IsFixedLength) {
4495 if (Index.getSimpleValueType().isFixedLengthVector())
4496 Index = convertToScalableVector(DAG, IndexVT, Index);
4497 if (BasePtr.getSimpleValueType().isFixedLengthVector())
4498 BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
4500 }
4501
4502 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
4503 return DAG.getNode(Opcode, DL, VTs, Ops);
4504}
4505
4506SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
4507 SDLoc DL(Op);
4509 assert(LoadNode && "Expected custom lowering of a masked load node");
4510 EVT VT = Op->getValueType(0);
4511
4512 if (useSVEForFixedLengthVectorVT(VT, true))
4513 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
4514
4515 SDValue PassThru = LoadNode->getPassThru();
4516 SDValue Mask = LoadNode->getMask();
4517
4518 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
4519 return Op;
4520
4522 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
4523 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
4524 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
4525 LoadNode->getExtensionType());
4526
4527 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
4528
4529 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
4530}
4531
4532// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
4533 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
4534 EVT VT, EVT MemVT,
4535 SelectionDAG &DAG) {
4536 assert(VT.isVector() && "VT should be a vector type");
4537 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
4538
4539 SDValue Value = ST->getValue();
4540
4541 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
4542 // the word lane which represents the v4i8 subvector. It optimizes the store
4543 // to:
4544 //
4545 // xtn v0.8b, v0.8h
4546 // str s0, [x0]
4547
4548 SDValue Undef = DAG.getUNDEF(MVT::i16);
4550 {Undef, Undef, Undef, Undef});
4551
4553 Value, UndefVec);
4555
4556 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
4558 Trunc, DAG.getConstant(0, DL, MVT::i64));
4559
4560 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
4561 ST->getBasePtr(), ST->getMemOperand());
4562}
4563
4564 // Custom lowering for any store, vector or scalar, and/or default or with
4565 // a truncating operation. Currently we only custom-lower truncating stores
4566 // from vector v4i16 to v4i8 and volatile stores of i128.
4567SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
4568 SelectionDAG &DAG) const {
4569 SDLoc Dl(Op);
4571 assert (StoreNode && "Can only custom lower store nodes");
4572
4573 SDValue Value = StoreNode->getValue();
4574
4575 EVT VT = Value.getValueType();
4576 EVT MemVT = StoreNode->getMemoryVT();
4577
4578 if (VT.isVector()) {
4579 if (useSVEForFixedLengthVectorVT(VT, true))
4580 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
4581
4582 unsigned AS = StoreNode->getAddressSpace();
4583 Align Alignment = StoreNode->getAlign();
4584 if (Alignment < MemVT.getStoreSize() &&
4585 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
4586 StoreNode->getMemOperand()->getFlags(),
4587 nullptr)) {
4588 return scalarizeVectorStore(StoreNode, DAG);
4589 }
4590
4591 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
4592 MemVT == MVT::v4i8) {
4593 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
4594 }
4595 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
4596 // the custom lowering, as there are no un-paired non-temporal stores and
4597 // legalization will break up 256 bit inputs.
4598 ElementCount EC = MemVT.getVectorElementCount();
4599 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
4600 EC.isKnownEven() &&
4601 ((MemVT.getScalarSizeInBits() == 8u ||
4602 MemVT.getScalarSizeInBits() == 16u ||
4603 MemVT.getScalarSizeInBits() == 32u ||
4604 MemVT.getScalarSizeInBits() == 64u))) {
4605 SDValue Lo =
4607 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4608 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
4609 SDValue Hi =
4611 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4612 StoreNode->getValue(),
4613 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
4616 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4617 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4618 return Result;
4619 }
4620 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
4621 assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
4622 SDValue Lo =
4624 DAG.getConstant(0, Dl, MVT::i64));
4625 SDValue Hi =
4627 DAG.getConstant(1, Dl, MVT::i64));
4630 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4631 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4632 return Result;
4633 } else if (MemVT == MVT::i64x8) {
4634 SDValue Value = StoreNode->getValue();
4635 assert(Value->getValueType(0) == MVT::i64x8);
4636 SDValue Chain = StoreNode->getChain();
4637 SDValue Base = StoreNode->getBasePtr();
4638 EVT PtrVT = Base.getValueType();
4639 for (unsigned i = 0; i < 8; i++) {
4641 Value, DAG.getConstant(i, Dl, MVT::i32));
4642 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
4643 DAG.getConstant(i * 8, Dl, PtrVT));
4644 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
4645 StoreNode->getOriginalAlign());
4646 }
4647 return Chain;
4648 }
4649
4650 return SDValue();
4651}
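// Illustrative example (assumed typical output): a volatile i128 store as
// handled above, e.g.
//   store volatile i128 %v, i128* %p
// splits the value into two i64 halves and emits a single "stp x0, x1, [x2]"
// via the STP node, keeping the access as one paired store.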
4652
4653SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
4654 SelectionDAG &DAG) const {
4655 SDLoc DL(Op);
4657 assert(LoadNode && "Expected custom lowering of a load node");
4658
4659 if (LoadNode->getMemoryVT() == MVT::i64x8) {
4661 SDValue Base = LoadNode->getBasePtr();
4662 SDValue Chain = LoadNode->getChain();
4663 EVT PtrVT = Base.getValueType();
4664 for (unsigned i = 0; i < 8; i++) {
4665 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
4666 DAG.getConstant(i * 8, DL, PtrVT));
4667 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
4668 LoadNode->getPointerInfo(),
4669 LoadNode->getOriginalAlign());
4670 Ops.push_back(Part);
4671 Chain = SDValue(Part.getNode(), 1);
4672 }
4674 return DAG.getMergeValues({Loaded, Chain}, DL);
4675 }
4676
4677 // Custom lowering for extending v4i8 vector loads.
4678 EVT VT = Op->getValueType(0);
4679 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
4680
4681 if (LoadNode->getMemoryVT() != MVT::v4i8)
4682 return SDValue();
4683
4684 unsigned ExtType;
4685 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
4686 ExtType = ISD::SIGN_EXTEND;
4687 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
4688 LoadNode->getExtensionType() == ISD::EXTLOAD)
4689 ExtType = ISD::ZERO_EXTEND;
4690 else
4691 return SDValue();
4692
4693 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
4694 LoadNode->getBasePtr(), MachinePointerInfo());
4695 SDValue Chain = Load.getValue(1);
4697 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
4698 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
4700 DAG.getConstant(0, DL, MVT::i64));
4701 if (VT == MVT::v4i32)
4702 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
4703 return DAG.getMergeValues({Ext, Chain}, DL);
4704}
4705
4706// Generate SUBS and CSEL for integer abs.
4707SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
4708 MVT VT = Op.getSimpleValueType();
4709
4710 if (VT.isVector())
4711 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
4712
4713 SDLoc DL(Op);
4714 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
4715 Op.getOperand(0));
4716 // Generate SUBS & CSEL.
4717 SDValue Cmp =
4718 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
4719 Op.getOperand(0), DAG.getConstant(0, DL, VT));
4720 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
4721 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
4722 Cmp.getValue(1));
4723}
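// Illustrative example (assumed typical output) of the scalar path above:
//   %r = call i32 @llvm.abs.i32(i32 %x, i1 false)
// compares %x against zero and selects between %x and its negation, which
// usually ends up as "cmp w0, #0; cneg w0, w0, mi".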
4724
4725 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
4726 SelectionDAG &DAG) const {
4727 LLVM_DEBUG(dbgs() << "Custom lowering: ");
4728 LLVM_DEBUG(Op.dump());
4729
4730 switch (Op.getOpcode()) {
4731 default:
4732 llvm_unreachable("unimplemented operand");
4733 return SDValue();
4734 case ISD::BITCAST:
4735 return LowerBITCAST(Op, DAG);
4736 case ISD::GlobalAddress:
4737 return LowerGlobalAddress(Op, DAG);
4739 return LowerGlobalTLSAddress(Op, DAG);
4740 case ISD::SETCC:
4741 case ISD::STRICT_FSETCC:
4743 return LowerSETCC(Op, DAG);
4744 case ISD::BR_CC:
4745 return LowerBR_CC(Op, DAG);
4746 case ISD::SELECT:
4747 return LowerSELECT(Op, DAG);
4748 case ISD::SELECT_CC:
4749 return LowerSELECT_CC(Op, DAG);
4750 case ISD::JumpTable:
4751 return LowerJumpTable(Op, DAG);
4752 case ISD::BR_JT:
4753 return LowerBR_JT(Op, DAG);
4754 case ISD::ConstantPool:
4755 return LowerConstantPool(Op, DAG);
4756 case ISD::BlockAddress:
4757 return LowerBlockAddress(Op, DAG);
4758 case ISD::VASTART:
4759 return LowerVASTART(Op, DAG);
4760 case ISD::VACOPY:
4761 return LowerVACOPY(Op, DAG);
4762 case ISD::VAARG:
4763 return LowerVAARG(Op, DAG);
4764 case ISD::ADDC:
4765 case ISD::ADDE:
4766 case ISD::SUBC:
4767 case ISD::SUBE:
4768 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
4769 case ISD::SADDO:
4770 case ISD::UADDO:
4771 case ISD::SSUBO:
4772 case ISD::USUBO:
4773 case ISD::SMULO:
4774 case ISD::UMULO:
4775 return LowerXALUO(Op, DAG);
4776 case ISD::FADD:
4777 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
4778 case ISD::FSUB:
4779 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
4780 case ISD::FMUL:
4781 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
4782 case ISD::FMA:
4783 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
4784 case ISD::FDIV:
4785 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
4786 case ISD::FNEG:
4787 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
4788 case ISD::FCEIL:
4789 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
4790 case ISD::FFLOOR:
4791 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
4792 case ISD::FNEARBYINT:
4793 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
4794 case ISD::FRINT:
4795 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
4796 case ISD::FROUND:
4797 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
4798 case ISD::FROUNDEVEN:
4799 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
4800 case ISD::FTRUNC:
4801 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
4802 case ISD::FSQRT:
4803 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
4804 case ISD::FABS:
4805 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
4806 case ISD::FP_ROUND:
4807 case ISD::STRICT_FP_ROUND:
4808 return LowerFP_ROUND(Op, DAG);
4809 case ISD::FP_EXTEND:
4810 return LowerFP_EXTEND(Op, DAG);
4811 case ISD::FRAMEADDR:
4812 return LowerFRAMEADDR(Op, DAG);
4813 case ISD::SPONENTRY:
4814 return LowerSPONENTRY(Op, DAG);
4815 case ISD::RETURNADDR:
4816 return LowerRETURNADDR(Op, DAG);
4817 case ISD::ADDROFRETURNADDR:
4818 return LowerADDROFRETURNADDR(Op, DAG);
4819 case ISD::CONCAT_VECTORS:
4820 return LowerCONCAT_VECTORS(Op, DAG);
4821 case ISD::INSERT_VECTOR_ELT:
4822 return LowerINSERT_VECTOR_ELT(Op, DAG);
4823 case ISD::EXTRACT_VECTOR_ELT:
4824 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
4825 case ISD::BUILD_VECTOR:
4826 return LowerBUILD_VECTOR(Op, DAG);
4827 case ISD::VECTOR_SHUFFLE:
4828 return LowerVECTOR_SHUFFLE(Op, DAG);
4829 case ISD::SPLAT_VECTOR:
4830 return LowerSPLAT_VECTOR(Op, DAG);
4831 case ISD::EXTRACT_SUBVECTOR:
4832 return LowerEXTRACT_SUBVECTOR(Op, DAG);
4833 case ISD::INSERT_SUBVECTOR:
4834 return LowerINSERT_SUBVECTOR(Op, DAG);
4835 case ISD::SDIV:
4836 case ISD::UDIV:
4837 return LowerDIV(Op, DAG);
4838 case ISD::SMIN:
4839 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
4840 /*OverrideNEON=*/true);
4841 case ISD::UMIN:
4842 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
4843 /*OverrideNEON=*/true);
4844 case ISD::SMAX:
4845 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
4846 /*OverrideNEON=*/true);
4847 case ISD::UMAX:
4848 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
4849 /*OverrideNEON=*/true);
4850 case ISD::SRA:
4851 case ISD::SRL:
4852 case ISD::SHL:
4853 return LowerVectorSRA_SRL_SHL(Op, DAG);
4854 case ISD::SHL_PARTS:
4855 case ISD::SRL_PARTS:
4856 case ISD::SRA_PARTS:
4857 return LowerShiftParts(Op, DAG);
4858 case ISD::CTPOP:
4859 return LowerCTPOP(Op, DAG);
4860 case ISD::FCOPYSIGN:
4861 return LowerFCOPYSIGN(Op, DAG);
4862 case ISD::OR:
4863 return LowerVectorOR(Op, DAG);
4864 case ISD::XOR:
4865 return LowerXOR(Op, DAG);
4866 case ISD::PREFETCH:
4867 return LowerPREFETCH(Op, DAG);
4868 case ISD::SINT_TO_FP:
4869 case ISD::UINT_TO_FP:
4870 case ISD::STRICT_SINT_TO_FP:
4871 case ISD::STRICT_UINT_TO_FP:
4872 return LowerINT_TO_FP(Op, DAG);
4873 case ISD::FP_TO_SINT:
4874 case ISD::FP_TO_UINT:
4875 case ISD::STRICT_FP_TO_SINT:
4876 case ISD::STRICT_FP_TO_UINT:
4877 return LowerFP_TO_INT(Op, DAG);
4878 case ISD::FP_TO_SINT_SAT:
4879 case ISD::FP_TO_UINT_SAT:
4880 return LowerFP_TO_INT_SAT(Op, DAG);
4881 case ISD::FSINCOS:
4882 return LowerFSINCOS(Op, DAG);
4883 case ISD::FLT_ROUNDS_:
4884 return LowerFLT_ROUNDS_(Op, DAG);
4885 case ISD::SET_ROUNDING:
4886 return LowerSET_ROUNDING(Op, DAG);
4887 case ISD::MUL:
4888 return LowerMUL(Op, DAG);
4889 case ISD::MULHS:
4890 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,
4891 /*OverrideNEON=*/true);
4892 case ISD::MULHU:
4893 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
4894 /*OverrideNEON=*/true);
4895 case ISD::INTRINSIC_WO_CHAIN:
4896 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
4897 case ISD::STORE:
4898 return LowerSTORE(Op, DAG);
4899 case ISD::MSTORE:
4900 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
4901 case ISD::MGATHER:
4902 return LowerMGATHER(Op, DAG);
4903 case ISD::MSCATTER:
4904 return LowerMSCATTER(Op, DAG);
4905 case ISD::VECREDUCE_SEQ_FADD:
4906 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
4907 case ISD::VECREDUCE_ADD:
4908 case ISD::VECREDUCE_AND:
4909 case ISD::VECREDUCE_OR:
4910 case ISD::VECREDUCE_XOR:
4911 case ISD::VECREDUCE_SMAX:
4912 case ISD::VECREDUCE_SMIN:
4913 case ISD::VECREDUCE_UMAX:
4914 case ISD::VECREDUCE_UMIN:
4915 case ISD::VECREDUCE_FADD:
4916 case ISD::VECREDUCE_FMAX:
4917 case ISD::VECREDUCE_FMIN:
4918 return LowerVECREDUCE(Op, DAG);
4919 case ISD::ATOMIC_LOAD_SUB:
4920 return LowerATOMIC_LOAD_SUB(Op, DAG);
4921 case ISD::ATOMIC_LOAD_AND:
4922 return LowerATOMIC_LOAD_AND(Op, DAG);
4923 case ISD::DYNAMIC_STACKALLOC:
4924 return LowerDYNAMIC_STACKALLOC(Op, DAG);
4925 case ISD::VSCALE:
4926 return LowerVSCALE(Op, DAG);
4927 case ISD::ANY_EXTEND:
4928 case ISD::SIGN_EXTEND:
4929 case ISD::ZERO_EXTEND:
4930 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
4931 case ISD::SIGN_EXTEND_INREG: {
4932 // Only custom lower when ExtraVT has a legal byte based element type.
4933 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4934 EVT ExtraEltVT = ExtraVT.getVectorElementType();
4935 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
4936 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
4937 return SDValue();
4938
4939 return LowerToPredicatedOp(Op, DAG,
4940 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
4941 }
4942 case ISD::TRUNCATE:
4943 return LowerTRUNCATE(Op, DAG);
4944 case ISD::MLOAD:
4945 return LowerMLOAD(Op, DAG);
4946 case ISD::LOAD:
4947 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
4948 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
4949 return LowerLOAD(Op, DAG);
4950 case ISD::ADD:
4951 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
4952 case ISD::AND:
4953 return LowerToScalableOp(Op, DAG);
4954 case ISD::SUB:
4955 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED);
4956 case ISD::FMAXIMUM:
4957 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
4958 case ISD::FMAXNUM:
4959 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
4960 case ISD::FMINIMUM:
4961 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
4962 case ISD::FMINNUM:
4963 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
4964 case ISD::VSELECT:
4965 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
4966 case ISD::ABS:
4967 return LowerABS(Op, DAG);
4968 case ISD::BITREVERSE:
4969 return LowerBitreverse(Op, DAG);
4970 case ISD::BSWAP:
4971 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
4972 case ISD::CTLZ:
4973 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
4974 /*OverrideNEON=*/true);
4975 case ISD::CTTZ:
4976 return LowerCTTZ(Op, DAG);
4977 case ISD::VECTOR_SPLICE:
4978 return LowerVECTOR_SPLICE(Op, DAG);
4979 }
4980}
4981
4982bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
4983 return !Subtarget->useSVEForFixedLengthVectors();
4984}
4985
4986bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
4987 EVT VT, bool OverrideNEON) const {
4988 if (!Subtarget->useSVEForFixedLengthVectors())
4989 return false;
4990
4991 if (!VT.isFixedLengthVector())
4992 return false;
4993
4994 // Don't use SVE for vectors we cannot scalarize if required.
4995 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
4996 // Fixed length predicates should be promoted to i8.
4997 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
4998 case MVT::i1:
4999 default:
5000 return false;
5001 case MVT::i8:
5002 case MVT::i16:
5003 case MVT::i32:
5004 case MVT::i64:
5005 case MVT::f16:
5006 case MVT::f32:
5007 case MVT::f64:
5008 break;
5009 }
5010
5011 // All SVE implementations support NEON sized vectors.
5012 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
5013 return true;
5014
5015 // Ensure NEON MVTs only belong to a single register class.
5016 if (VT.getFixedSizeInBits() <= 128)
5017 return false;
5018
5019 // Don't use SVE for types that don't fit.
5020 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
5021 return false;
5022
5023 // TODO: Perhaps an artificial restriction, but worth having whilst getting
5024 // the base fixed length SVE support in place.
5025 if (!VT.isPow2VectorType())
5026 return false;
5027
5028 return true;
5029}
5030
5031//===----------------------------------------------------------------------===//
5032// Calling Convention Implementation
5033//===----------------------------------------------------------------------===//
5034
5035/// Selects the correct CCAssignFn for a given CallingConvention value.
5036CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
5037 bool IsVarArg) const {
5038 switch (CC) {
5039 default:
5040 report_fatal_error("Unsupported calling convention.");
5041 case CallingConv::WebKit_JS:
5042 return CC_AArch64_WebKit_JS;
5043 case CallingConv::GHC:
5044 return CC_AArch64_GHC;
5045 case CallingConv::C:
5046 case CallingConv::Fast:
5047 case CallingConv::PreserveMost:
5048 case CallingConv::CXX_FAST_TLS:
5049 case CallingConv::Swift:
5050 case CallingConv::SwiftTail:
5051 case CallingConv::Tail:
5052 if (Subtarget->isTargetWindows() && IsVarArg)
5053 return CC_AArch64_Win64_VarArg;
5054 if (!Subtarget->isTargetDarwin())
5055 return CC_AArch64_AAPCS;
5056 if (!IsVarArg)
5057 return CC_AArch64_DarwinPCS;
5058 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
5059 : CC_AArch64_DarwinPCS_VarArg;
5060 case CallingConv::Win64:
5061 return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
5062 case CallingConv::CFGuard_Check:
5063 return CC_AArch64_Win64_CFGuard_Check;
5064 case CallingConv::AArch64_VectorCall:
5065 case CallingConv::AArch64_SVE_VectorCall:
5066 return CC_AArch64_AAPCS;
5067 }
5068}
5069
5070CCAssignFn *
5071AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
5072 return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
5073 : RetCC_AArch64_AAPCS;
5074}
5075
5076SDValue AArch64TargetLowering::LowerFormalArguments(
5077 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
5078 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
5079 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
5080 MachineFunction &MF = DAG.getMachineFunction();
5081 MachineFrameInfo &MFI = MF.getFrameInfo();
5082 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
5083
5084 // Assign locations to all of the incoming arguments.
5085 SmallVector<CCValAssign, 16> ArgLocs;
5087 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
5088 *DAG.getContext());
5089
5090 // At this point, Ins[].VT may already be promoted to i32. To correctly
5091 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
5092 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
5093 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
5094 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
5095 // LocVT.
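// For example, an i8 argument whose Ins[].VT was already promoted to i32 is
// re-analyzed here with ValVT = i8, so it is given a 1-byte stack slot rather
// than the 4-byte slot an i32 LocVT would imply.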
5096 unsigned NumArgs = Ins.size();
5097 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
5098 unsigned CurArgIdx = 0;
5099 for (unsigned i = 0; i != NumArgs; ++i) {
5100 MVT ValVT = Ins[i].VT;
5101 if (Ins[i].isOrigArg()) {
5102 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
5103 CurArgIdx = Ins[i].getOrigArgIndex();
5104
5105 // Get type of the original argument.
5106 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
5107 /*AllowUnknown*/ true);
5108 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
5109 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
5110 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
5111 ValVT = MVT::i8;
5112 else if (ActualMVT == MVT::i16)
5113 ValVT = MVT::i16;
5114 }
5115 bool UseVarArgCC = false;
5116 if (IsWin64)
5117 UseVarArgCC = isVarArg;
5118 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
5119 bool Res =
5120 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
5121 assert(!Res && "Call operand has unhandled type");
5122 (void)Res;
5123 }
5125 unsigned ExtraArgLocs = 0;
5126 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5127 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
5128
5129 if (Ins[i].Flags.isByVal()) {
5130 // Byval is used for HFAs in the PCS, but the system should work in a
5131 // non-compliant manner for larger structs.
5132 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5133 int Size = Ins[i].Flags.getByValSize();
5134 unsigned NumRegs = (Size + 7) / 8;
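// i.e. the byval size is rounded up to a whole number of 8-byte registers;
// for example a 12-byte struct reserves two slots (16 bytes) below.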
5135
5136 // FIXME: This works on big-endian for composite byvals, which are the common
5137 // case. It should also work for fundamental types too.
5138 unsigned FrameIdx =
5139 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
5140 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
5141 InVals.push_back(FrameIdxN);
5142
5143 continue;
5144 }
5145
5146 if (Ins[i].Flags.isSwiftAsync())
5147 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5148
5149 SDValue ArgValue;
5150 if (VA.isRegLoc()) {
5151 // Arguments stored in registers.
5152 EVT RegVT = VA.getLocVT();
5153 const TargetRegisterClass *RC;
5154
5155 if (RegVT == MVT::i32)
5156 RC = &AArch64::GPR32RegClass;
5157 else if (RegVT == MVT::i64)
5158 RC = &AArch64::GPR64RegClass;
5159 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
5160 RC = &AArch64::FPR16RegClass;
5161 else if (RegVT == MVT::f32)
5162 RC = &AArch64::FPR32RegClass;
5163 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
5164 RC = &AArch64::FPR64RegClass;
5165 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
5166 RC = &AArch64::FPR128RegClass;
5167 else if (RegVT.isScalableVector() &&
5168 RegVT.getVectorElementType() == MVT::i1)
5169 RC = &AArch64::PPRRegClass;
5170 else if (RegVT.isScalableVector())
5171 RC = &AArch64::ZPRRegClass;
5172 else
5173 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
5174
5175 // Transform the arguments in physical registers into virtual ones.
5176 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
5177 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
5178
5179 // If this is an 8, 16 or 32-bit value, it is really passed promoted
5180 // to 64 bits. Insert an assert[sz]ext to capture this, then
5181 // truncate to the right size.
5182 switch (VA.getLocInfo()) {
5183 default:
5184 llvm_unreachable("Unknown loc info!");
5185 case CCValAssign::Full:
5186 break;
5187 case CCValAssign::Indirect:
5188 assert(VA.getValVT().isScalableVector() &&
5189 "Only scalable vectors can be passed indirectly");
5190 break;
5191 case CCValAssign::BCvt:
5192 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
5193 break;
5194 case CCValAssign::AExt:
5195 case CCValAssign::SExt:
5196 case CCValAssign::ZExt:
5197 break;
5198 case CCValAssign::AExtUpper:
5199 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
5200 DAG.getConstant(32, DL, RegVT));
5201 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
5202 break;
5203 }
5204 } else { // VA.isRegLoc()
5205 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
5206 unsigned ArgOffset = VA.getLocMemOffset();
5207 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
5208 ? VA.getLocVT().getSizeInBits()
5209 : VA.getValVT().getSizeInBits()) / 8;
5210
5211 uint32_t BEAlign = 0;
5212 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
5213 !Ins[i].Flags.isInConsecutiveRegs())
5214 BEAlign = 8 - ArgSize;
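// On big-endian targets a small argument sits in the high bytes of its
// 8-byte slot, e.g. a 4-byte value starts 4 bytes in, so the load offset
// is bumped by BEAlign.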
5215
5216 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
5217
5218 // Create load nodes to retrieve arguments from the stack.
5219 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
5220
5221 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
5222 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
5223 MVT MemVT = VA.getValVT();
5224
5225 switch (VA.getLocInfo()) {
5226 default:
5227 break;
5228 case CCValAssign::Trunc:
5229 case CCValAssign::BCvt:
5230 MemVT = VA.getLocVT();
5231 break;
5232 case CCValAssign::Indirect:
5233 assert(VA.getValVT().isScalableVector() &&
5234 "Only scalable vectors can be passed indirectly");
5235 MemVT = VA.getLocVT();
5236 break;
5237 case CCValAssign::SExt:
5238 ExtType = ISD::SEXTLOAD;
5239 break;
5240 case CCValAssign::ZExt:
5241 ExtType = ISD::ZEXTLOAD;
5242 break;
5243 case CCValAssign::AExt:
5244 ExtType = ISD::EXTLOAD;
5245 break;
5246 }
5247
5248 ArgValue = DAG.getExtLoad(
5249 ExtType, DL, VA.getLocVT(), Chain, FIN,
5250 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
5251 MemVT);
5252 }
5253
5254 if (VA.getLocInfo() == CCValAssign::Indirect) {
5255 assert(VA.getValVT().isScalableVector() &&
5256 "Only scalable vectors can be passed indirectly");
5257
5258 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
5259 unsigned NumParts = 1;
5260 if (Ins[i].Flags.isInConsecutiveRegs()) {
5261 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
5262 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
5263 ++NumParts;
5264 }
5265
5266 MVT PartLoad = VA.getValVT();
5267 SDValue Ptr = ArgValue;
5268
5269 // Ensure we generate all loads for each tuple part, whilst updating the
5270 // pointer after each load correctly using vscale.
5271 while (NumParts > 0) {
5272 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
5273 InVals.push_back(ArgValue);
5274 NumParts--;
5275 if (NumParts > 0) {
5276 SDValue BytesIncrement = DAG.getVScale(
5277 DL, Ptr.getValueType(),
5278 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
5279 SDNodeFlags Flags;
5280 Flags.setNoUnsignedWrap(true);
5281 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5282 BytesIncrement, Flags);
5283 ExtraArgLocs++;
5284 i++;
5285 }
5286 }
5287 } else {
5288 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
5289 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
5290 ArgValue, DAG.getValueType(MVT::i32));
5291 InVals.push_back(ArgValue);
5292 }
5293 }
5294 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
5295
5296 // varargs
5297 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5298 if (isVarArg) {
5299 if (!Subtarget->isTargetDarwin() || IsWin64) {
5300 // The AAPCS variadic function ABI is identical to the non-variadic
5301 // one. As a result there may be more arguments in registers and we should
5302 // save them for future reference.
5303 // Win64 variadic functions also pass arguments in registers, but all float
5304 // arguments are passed in integer registers.
5305 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
5306 }
5307
5308 // This will point to the next argument passed via stack.
5309 unsigned StackOffset = CCInfo.getNextStackOffset();
5310 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
5311 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
5312 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
5313
5314 if (MFI.hasMustTailInVarArgFunc()) {
5315 SmallVector<MVT, 2> RegParmTypes;
5316 RegParmTypes.push_back(MVT::i64);
5317 RegParmTypes.push_back(MVT::f128);
5318 // Compute the set of forwarded registers. The rest are scratch.
5319 SmallVectorImpl<ForwardedRegister> &Forwards =
5320 FuncInfo->getForwardedMustTailRegParms();
5321 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
5322 CC_AArch64_AAPCS);
5323
5324 // Conservatively forward X8, since it might be used for aggregate return.
5325 if (!CCInfo.isAllocated(AArch64::X8)) {
5326 unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
5327 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
5328 }
5329 }
5330 }
5331
5332 // On Windows, InReg pointers must be returned, so record the pointer in a
5333 // virtual register at the start of the function so it can be returned in the
5334 // epilogue.
5335 if (IsWin64) {
5336 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
5337 if (Ins[I].Flags.isInReg()) {
5338 assert(!FuncInfo->getSRetReturnReg());
5339
5340 MVT PtrTy = getPointerTy(DAG.getDataLayout());
5341 Register Reg =
5342 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
5343 FuncInfo->setSRetReturnReg(Reg);
5344
5345 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
5346 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
5347 break;
5348 }
5349 }
5350 }
5351
5352 unsigned StackArgSize = CCInfo.getNextStackOffset();
5353 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
5354 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
5355 // This is a non-standard ABI so by fiat I say we're allowed to make full
5356 // use of the stack area to be popped, which must be aligned to 16 bytes in
5357 // any case:
5358 StackArgSize = alignTo(StackArgSize, 16);
5359
5360 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
5361 // a multiple of 16.
5362 FuncInfo->setArgumentStackToRestore(StackArgSize);
5363
5364 // This realignment carries over to the available bytes below. Our own
5365 // callers will guarantee the space is free by giving an aligned value to
5366 // CALLSEQ_START.
5367 }
5368 // Even if we're not expected to free up the space, it's useful to know how
5369 // much is there while considering tail calls (because we can reuse it).
5370 FuncInfo->setBytesInStackArgArea(StackArgSize);
5371
5372 if (Subtarget->hasCustomCallingConv())
5373 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
5374
5375 return Chain;
5376}
5377
5378void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
5379 SelectionDAG &DAG,
5380 const SDLoc &DL,
5381 SDValue &Chain) const {
5382 MachineFunction &MF = DAG.getMachineFunction();
5383 MachineFrameInfo &MFI = MF.getFrameInfo();
5384 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5385 auto PtrVT = getPointerTy(DAG.getDataLayout());
5386 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
5387
5388 SmallVector<SDValue, 8> MemOps;
5389
5390 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
5391 AArch64::X3, AArch64::X4, AArch64::X5,
5392 AArch64::X6, AArch64::X7 };
5393 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
5394 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
5395
5396 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
5397 int GPRIdx = 0;
5398 if (GPRSaveSize != 0) {
5399 if (IsWin64) {
5400 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
5401 if (GPRSaveSize & 15)
5402 // The extra size here, if triggered, will always be 8.
5403 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
5404 } else
5405 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
5406
5407 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
5408
5409 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
5410 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
5411 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
5412 SDValue Store = DAG.getStore(
5413 Val.getValue(1), DL, Val, FIN,
5414 IsWin64
5415 ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
5416 GPRIdx,
5417 (i - FirstVariadicGPR) * 8)
5418 : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
5419 MemOps.push_back(Store);
5420 FIN =
5421 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
5422 }
5423 }
5424 FuncInfo->setVarArgsGPRIndex(GPRIdx);
5425 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
5426
5427 if (Subtarget->hasFPARMv8() && !IsWin64) {
5428 static const MCPhysReg FPRArgRegs[] = {
5429 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
5430 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
5431 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
5432 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
5433
5434 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
5435 int FPRIdx = 0;
5436 if (FPRSaveSize != 0) {
5437 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
5438
5439 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
5440
5441 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
5442 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
5443 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
5444
5445 SDValue Store = DAG.getStore(
5446 Val.getValue(1), DL, Val, FIN,
5447 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
5448 MemOps.push_back(Store);
5449 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
5450 DAG.getConstant(16, DL, PtrVT));
5451 }
5452 }
5453 FuncInfo->setVarArgsFPRIndex(FPRIdx);
5454 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
5455 }
5456
5457 if (!MemOps.empty()) {
5458 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
5459 }
5460}
5461
5462/// LowerCallResult - Lower the result values of a call into the
5463/// appropriate copies out of appropriate physical registers.
5464SDValue AArch64TargetLowering::LowerCallResult(
5465 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
5466 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
5467 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
5468 SDValue ThisVal) const {
5469 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
5470 // Assign locations to each value returned by this call.
5471 SmallVector<CCValAssign, 16> RVLocs;
5472 DenseMap<unsigned, SDValue> CopiedRegs;
5473 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
5474 *DAG.getContext());
5475 CCInfo.AnalyzeCallResult(Ins, RetCC);
5476
5477 // Copy all of the result registers out of their specified physreg.
5478 for (unsigned i = 0; i != RVLocs.size(); ++i) {
5479 CCValAssign &VA = RVLocs[i];
5480
5481 // Pass 'this' value directly from the argument to return value, to avoid
5482 // reg unit interference
5483 if (i == 0 && isThisReturn) {
5484 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
5485 "unexpected return calling convention register assignment");
5486 InVals.push_back(ThisVal);
5487 continue;
5488 }
5489
5490 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
5491 // allows one use of a physreg per block.
5492 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
5493 if (!Val) {
5494 Val =
5495 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
5496 Chain = Val.getValue(1);
5497 InFlag = Val.getValue(2);
5498 CopiedRegs[VA.getLocReg()] = Val;
5499 }
5500
5501 switch (VA.getLocInfo()) {
5502 default:
5503 llvm_unreachable("Unknown loc info!");
5504 case CCValAssign::Full:
5505 break;
5506 case CCValAssign::BCvt:
5507 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
5508 break;
5509 case CCValAssign::AExtUpper:
5510 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
5511 DAG.getConstant(32, DL, VA.getLocVT()));
5512 LLVM_FALLTHROUGH;
5513 case CCValAssign::AExt:
5514 LLVM_FALLTHROUGH;
5515 case CCValAssign::ZExt:
5516 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
5517 break;
5518 }
5519
5520 InVals.push_back(Val);
5521 }
5522
5523 return Chain;
5524}
5525
5526/// Return true if the calling convention is one that we can guarantee TCO for.
5527static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
5528 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
5529 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
5530}
5531
5532/// Return true if we might ever do TCO for calls with this calling convention.
5533static bool mayTailCallThisCC(CallingConv::ID CC) {
5534 switch (CC) {
5535 case CallingConv::C:
5536 case CallingConv::AArch64_SVE_VectorCall:
5537 case CallingConv::PreserveMost:
5538 case CallingConv::Swift:
5539 case CallingConv::SwiftTail:
5540 case CallingConv::Tail:
5541 case CallingConv::Fast:
5542 return true;
5543 default:
5544 return false;
5545 }
5546}
5547
5548bool AArch64TargetLowering::isEligibleForTailCallOptimization(
5549 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
5550 const SmallVectorImpl<ISD::OutputArg> &Outs,
5551 const SmallVectorImpl<SDValue> &OutVals,
5552 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5553 if (!mayTailCallThisCC(CalleeCC))
5554 return false;
5555
5556 MachineFunction &MF = DAG.getMachineFunction();
5557 const Function &CallerF = MF.getFunction();
5558 CallingConv::ID CallerCC = CallerF.getCallingConv();
5559
5560 // Functions using the C or Fast calling convention that have an SVE signature
5561 // preserve more registers and should assume the SVE_VectorCall CC.
5562 // The check for matching callee-saved regs will determine whether it is
5563 // eligible for TCO.
5567
5568 bool CCMatch = CallerCC == CalleeCC;
5569
5570 // When using the Windows calling convention on a non-windows OS, we want
5571 // to back up and restore X18 in such functions; we can't do a tail call
5572 // from those functions.
5573 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
5574 CalleeCC != CallingConv::Win64)
5575 return false;
5576
5577 // Byval parameters hand the function a pointer directly into the stack area
5578 // we want to reuse during a tail call. Working around this *is* possible (see
5579 // X86) but less efficient and uglier in LowerCall.
5580 for (Function::const_arg_iterator i = CallerF.arg_begin(),
5581 e = CallerF.arg_end();
5582 i != e; ++i) {
5583 if (i->hasByValAttr())
5584 return false;
5585
5586 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
5587 // In this case, it is necessary to save/restore X0 in the callee. Tail
5588 // call opt interferes with this. So we disable tail call opt when the
5589 // caller has an argument with "inreg" attribute.
5590
5591 // FIXME: Check whether the callee also has an "inreg" argument.
5592 if (i->hasInRegAttr())
5593 return false;
5594 }
5595
5596 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
5597 return CCMatch;
5598
5599 // Externally-defined functions with weak linkage should not be
5600 // tail-called on AArch64 when the OS does not support dynamic
5601 // pre-emption of symbols, as the AAELF spec requires normal calls
5602 // to undefined weak functions to be replaced with a NOP or jump to the
5603 // next instruction. The behaviour of branch instructions in this
5604 // situation (as used for tail calls) is implementation-defined, so we
5605 // cannot rely on the linker replacing the tail call with a return.
5606 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
5607 const GlobalValue *GV = G->getGlobal();
5608 const Triple &TT = getTargetMachine().getTargetTriple();
5609 if (GV->hasExternalWeakLinkage() &&
5610 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
5611 return false;
5612 }
5613
5614 // Now we search for cases where we can use a tail call without changing the
5615 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
5616 // concept.
5617
5618 // I want anyone implementing a new calling convention to think long and hard
5619 // about this assert.
5620 assert((!isVarArg || CalleeCC == CallingConv::C) &&
5621 "Unexpected variadic calling convention");
5622
5623 LLVMContext &C = *DAG.getContext();
5624 if (isVarArg && !Outs.empty()) {
5625 // At least two cases here: if caller is fastcc then we can't have any
5626 // memory arguments (we'd be expected to clean up the stack afterwards). If
5627 // caller is C then we could potentially use its argument area.
5628
5629 // FIXME: for now we take the most conservative of these in both cases:
5630 // disallow all variadic memory operands.
5631 SmallVector<CCValAssign, 16> ArgLocs;
5632 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5633
5634 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
5635 for (const CCValAssign &ArgLoc : ArgLocs)
5636 if (!ArgLoc.isRegLoc())
5637 return false;
5638 }
5639
5640 // Check that the call results are passed in the same way.
5641 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5642 CCAssignFnForCall(CalleeCC, isVarArg),
5643 CCAssignFnForCall(CallerCC, isVarArg)))
5644 return false;
5645 // The callee has to preserve all registers the caller needs to preserve.
5646 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5647 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5648 if (!CCMatch) {
5649 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5650 if (Subtarget->hasCustomCallingConv()) {
5651 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
5652 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
5653 }
5654 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5655 return false;
5656 }
5657
5658 // Nothing more to check if the callee is taking no arguments
5659 if (Outs.empty())
5660 return true;
5661
5662 SmallVector<CCValAssign, 16> ArgLocs;
5663 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5664
5665 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
5666
5667 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5668
5669 // If any of the arguments is passed indirectly, it must be SVE, so the
5670 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
5671 // allocate space on the stack. That is why we determine explicitly here that
5672 // such a call cannot be a tailcall.
5673 if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
5674 assert((A.getLocInfo() != CCValAssign::Indirect ||
5675 A.getValVT().isScalableVector()) &&
5676 "Expected value to be scalable");
5677 return A.getLocInfo() == CCValAssign::Indirect;
5678 }))
5679 return false;
5680
5681 // If the stack arguments for this call do not fit into our own save area then
5682 // the call cannot be made tail.
5683 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
5684 return false;
5685
5686 const MachineRegisterInfo &MRI = MF.getRegInfo();
5687 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5688 return false;
5689
5690 return true;
5691}
5692
5693SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
5694 SelectionDAG &DAG,
5695 MachineFrameInfo &MFI,
5696 int ClobberedFI) const {
5697 SmallVector<SDValue, 8> ArgChains;
5698 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
5699 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
5700
5701 // Include the original chain at the beginning of the list. When this is
5702 // used by target LowerCall hooks, this helps legalize find the
5703 // CALLSEQ_BEGIN node.
5704 ArgChains.push_back(Chain);
5705
5706 // Add a chain value for each stack argument corresponding
5707 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
5708 UE = DAG.getEntryNode().getNode()->use_end();
5709 U != UE; ++U)
5710 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
5711 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
5712 if (FI->getIndex() < 0) {
5713 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
5714 int64_t InLastByte = InFirstByte;
5715 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
5716
5717 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
5718 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
5719 ArgChains.push_back(SDValue(L, 1));
5720 }
5721
5722 // Build a tokenfactor for all the chains.
5723 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
5724}
5725
5726bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
5727 bool TailCallOpt) const {
5728 return (CallCC == CallingConv::Fast && TailCallOpt) ||
5729 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
5730}
5731
5732/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
5733/// and add input and output parameter nodes.
5734SDValue
5735AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
5736 SmallVectorImpl<SDValue> &InVals) const {
5737 SelectionDAG &DAG = CLI.DAG;
5738 SDLoc &DL = CLI.DL;
5739 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
5740 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
5741 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
5742 SDValue Chain = CLI.Chain;
5743 SDValue Callee = CLI.Callee;
5744 bool &IsTailCall = CLI.IsTailCall;
5745 CallingConv::ID CallConv = CLI.CallConv;
5746 bool IsVarArg = CLI.IsVarArg;
5747
5748 MachineFunction &MF = DAG.getMachineFunction();
5749 MachineFunction::CallSiteInfo CSInfo;
5750 bool IsThisReturn = false;
5751
5752 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5753 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
5754 bool IsSibCall = false;
5755 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CallConv);
5756
5757 // Check callee args/returns for SVE registers and set calling convention
5758 // accordingly.
5759 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
5760 bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
5761 return Out.VT.isScalableVector();
5762 });
5763 bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
5764 return In.VT.isScalableVector();
5765 });
5766
5767 if (CalleeInSVE || CalleeOutSVE)
5768 CallConv = CallingConv::AArch64_SVE_VectorCall;
5769 }
5770
5771 if (IsTailCall) {
5772 // Check if it's really possible to do a tail call.
5773 IsTailCall = isEligibleForTailCallOptimization(
5774 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
5775
5776 // A sibling call is one where we're under the usual C ABI and not planning
5777 // to change that but can still do a tail call:
5778 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
5779 CallConv != CallingConv::SwiftTail)
5780 IsSibCall = true;
5781
5782 if (IsTailCall)
5783 ++NumTailCalls;
5784 }
5785
5786 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
5787 report_fatal_error("failed to perform tail call elimination on a call "
5788 "site marked musttail");
5789
5790 // Analyze operands of the call, assigning locations to each operand.
5791 SmallVector<CCValAssign, 16> ArgLocs;
5792 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
5793 *DAG.getContext());
5794
5795 if (IsVarArg) {
5796 // Handle fixed and variable vector arguments differently.
5797 // Variable vector arguments always go into memory.
5798 unsigned NumArgs = Outs.size();
5799
5800 for (unsigned i = 0; i != NumArgs; ++i) {
5801 MVT ArgVT = Outs[i].VT;
5802 if (!Outs[i].IsFixed && ArgVT.isScalableVector())
5803 report_fatal_error("Passing SVE types to variadic functions is "
5804 "currently not supported");
5805
5806 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5807 bool UseVarArgCC = !Outs[i].IsFixed;
5808 // On Windows, the fixed arguments in a vararg call are passed in GPRs
5809 // too, so use the vararg CC to force them to integer registers.
5810 if (IsCalleeWin64)
5811 UseVarArgCC = true;
5812 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
5813 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
5814 assert(!Res && "Call operand has unhandled type");
5815 (void)Res;
5816 }
5817 } else {
5818 // At this point, Outs[].VT may already be promoted to i32. To correctly
5819 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
5820 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
5821 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
5822 // we use a special version of AnalyzeCallOperands to pass in ValVT and
5823 // LocVT.
5824 unsigned NumArgs = Outs.size();
5825 for (unsigned i = 0; i != NumArgs; ++i) {
5826 MVT ValVT = Outs[i].VT;
5827 // Get type of the original argument.
5828 EVT ActualVT = getValueType(DAG.getDataLayout(),
5829 CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
5830 /*AllowUnknown*/ true);
5831 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
5832 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5833 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
5834 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
5835 ValVT = MVT::i8;
5836 else if (ActualMVT == MVT::i16)
5837 ValVT = MVT::i16;
5838
5839 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
5840 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
5841 assert(!Res && "Call operand has unhandled type");
5842 (void)Res;
5843 }
5844 }
5845
5846 // Get a count of how many bytes are to be pushed on the stack.
5847 unsigned NumBytes = CCInfo.getNextStackOffset();
5848
5849 if (IsSibCall) {
5850 // Since we're not changing the ABI to make this a tail call, the memory
5851 // operands are already available in the caller's incoming argument space.
5852 NumBytes = 0;
5853 }
5854
5855 // FPDiff is the byte offset of the call's argument area from the callee's.
5856 // Stores to callee stack arguments will be placed in FixedStackSlots offset
5857 // by this amount for a tail call. In a sibling call it must be 0 because the
5858 // caller will deallocate the entire stack and the callee still expects its
5859 // arguments to begin at SP+0. Completely unused for non-tail calls.
5860 int FPDiff = 0;
5861
5862 if (IsTailCall && !IsSibCall) {
5863 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
5864
5865 // Since callee will pop argument stack as a tail call, we must keep the
5866 // popped size 16-byte aligned.
5867 NumBytes = alignTo(NumBytes, 16);
5868
5869 // FPDiff will be negative if this tail call requires more space than we
5870 // would automatically have in our incoming argument space. Positive if we
5871 // can actually shrink the stack.
5872 FPDiff = NumReusableBytes - NumBytes;
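// For example, a caller that reserved 16 bytes of incoming stack-argument
// space tail-calling a function that needs 32 bytes of outgoing arguments
// gives FPDiff = -16: the callee's argument area starts 16 bytes below ours.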
5873
5874 // Update the required reserved area if this is the tail call requiring the
5875 // most argument stack space.
5876 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
5877 FuncInfo->setTailCallReservedStack(-FPDiff);
5878
5879 // The stack pointer must be 16-byte aligned at all times it's used for a
5880 // memory operation, which in practice means at *all* times and in
5881 // particular across call boundaries. Therefore our own arguments started at
5882 // a 16-byte aligned SP and the delta applied for the tail call should
5883 // satisfy the same constraint.
5884 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
5885 }
5886
5887 // Adjust the stack pointer for the new arguments...
5888 // These operations are automatically eliminated by the prolog/epilog pass
5889 if (!IsSibCall)
5890 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
5891
5892 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
5893 getPointerTy(DAG.getDataLayout()));
5894
5895 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
5896 SmallSet<unsigned, 8> RegsUsed;
5897 SmallVector<SDValue, 8> MemOpChains;
5898 auto PtrVT = getPointerTy(DAG.getDataLayout());
5899
5900 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
5901 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
5902 for (const auto &F : Forwards) {
5903 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
5904 RegsToPass.emplace_back(F.PReg, Val);
5905 }
5906 }
5907
5908 // Walk the register/memloc assignments, inserting copies/loads.
5909 unsigned ExtraArgLocs = 0;
5910 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
5911 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
5912 SDValue Arg = OutVals[i];
5913 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5914
5915 // Promote the value if needed.
5916 switch (VA.getLocInfo()) {
5917 default:
5918 llvm_unreachable("Unknown loc info!");
5919 case CCValAssign::Full:
5920 break;
5921 case CCValAssign::SExt:
5922 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
5923 break;
5924 case CCValAssign::ZExt:
5925 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
5926 break;
5927 case CCValAssign::AExt:
5928 if (Outs[i].ArgVT == MVT::i1) {
5929 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
5930 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
5931 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
5932 }
5933 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5934 break;
5935 case CCValAssign::AExtUpper:
5936 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
5937 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5938 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
5939 DAG.getConstant(32, DL, VA.getLocVT()));
5940 break;
5941 case CCValAssign::BCvt:
5942 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
5943 break;
5944 case CCValAssign::Trunc:
5945 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
5946 break;
5947 case CCValAssign::FPExt:
5948 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
5949 break;
5950 case CCValAssign::Indirect:
5951 assert(VA.getValVT().isScalableVector() &&
5952 "Only scalable vectors can be passed indirectly");
5953
5954 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
5955 uint64_t PartSize = StoreSize;
5956 unsigned NumParts = 1;
5957 if (Outs[i].Flags.isInConsecutiveRegs()) {
5958 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
5959 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
5960 ++NumParts;
5961 StoreSize *= NumParts;
5962 }
5963
5964 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
5965 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
5966 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
5967 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
5968 MFI.setStackID(FI, TargetStackID::ScalableVector);
5969
5970 MachinePointerInfo MPI =
5971 MachinePointerInfo::getFixedStack(MF, FI);
5972 SDValue Ptr = DAG.getFrameIndex(
5973 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
5974 SDValue SpillSlot = Ptr;
5975
5976 // Ensure we generate all stores for each tuple part, whilst updating the
5977 // pointer after each store correctly using vscale.
5978 while (NumParts) {
5979 Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
5980 NumParts--;
5981 if (NumParts > 0) {
5982 SDValue BytesIncrement = DAG.getVScale(
5983 DL, Ptr.getValueType(),
5984 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
5985 SDNodeFlags Flags;
5986 Flags.setNoUnsignedWrap(true);
5987
5988 MPI = MachinePointerInfo(MPI.getAddrSpace());
5989 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5990 BytesIncrement, Flags);
5991 ExtraArgLocs++;
5992 i++;
5993 }
5994 }
5995
5996 Arg = SpillSlot;
5997 break;
5998 }
5999
6000 if (VA.isRegLoc()) {
6001 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
6002 Outs[0].VT == MVT::i64) {
6003 assert(VA.getLocVT() == MVT::i64 &&
6004 "unexpected calling convention register assignment");
6005 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
6006 "unexpected use of 'returned'");
6007 IsThisReturn = true;
6008 }
6009 if (RegsUsed.count(VA.getLocReg())) {
6010 // If this register has already been used then we're trying to pack
6011 // parts of an [N x i32] into an X-register. The extension type will
6012 // take care of putting the two halves in the right place but we have to
6013 // combine them.
6014 SDValue &Bits =
6015 llvm::find_if(RegsToPass,
6016 [=](const std::pair<unsigned, SDValue> &Elt) {
6017 return Elt.first == VA.getLocReg();
6018 })
6019 ->second;
6020 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
6021 // Call site info is used for function's parameter entry value
6022 // tracking. For now we track only simple cases when parameter
6023 // is transferred through whole register.
6024 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
6025 return ArgReg.Reg == VA.getLocReg();
6026 });
6027 } else {
6028 RegsToPass.emplace_back(VA.getLocReg(), Arg);
6029 RegsUsed.insert(VA.getLocReg());
6030 const TargetOptions &Options = DAG.getTarget().Options;
6031 if (Options.EmitCallSiteInfo)
6032 CSInfo.emplace_back(VA.getLocReg(), i);
6033 }
6034 } else {
6035 assert(VA.isMemLoc());
6036
6037 SDValue DstAddr;
6038 MachinePointerInfo DstInfo;
6039
6040 // FIXME: This works on big-endian for composite byvals, which are the
6041 // common case. It should also work for fundamental types too.
6042 uint32_t BEAlign = 0;
6043 unsigned OpSize;
6044 if (VA.getLocInfo() == CCValAssign::Indirect ||
6045 VA.getLocInfo() == CCValAssign::Trunc)
6046 OpSize = VA.getLocVT().getFixedSizeInBits();
6047 else
6048 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
6049 : VA.getValVT().getSizeInBits();
6050 OpSize = (OpSize + 7) / 8;
6051 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
6052 !Flags.isInConsecutiveRegs()) {
6053 if (OpSize < 8)
6054 BEAlign = 8 - OpSize;
6055 }
6056 unsigned LocMemOffset = VA.getLocMemOffset();
6057 int32_t Offset = LocMemOffset + BEAlign;
6058 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
6059 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
6060
6061 if (IsTailCall) {
6062 Offset = Offset + FPDiff;
6063 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
6064
6065 DstAddr = DAG.getFrameIndex(FI, PtrVT);
6066 DstInfo =
6067 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
6068
6069 // Make sure any stack arguments overlapping with where we're storing
6070 // are loaded before this eventual operation. Otherwise they'll be
6071 // clobbered.
6072 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
6073 } else {
6074 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
6075
6076 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
6077 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
6078 LocMemOffset);
6079 }
6080
6081 if (Outs[i].Flags.isByVal()) {
6082 SDValue SizeNode =
6083 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
6084 SDValue Cpy = DAG.getMemcpy(
6085 Chain, DL, DstAddr, Arg, SizeNode,
6086 Outs[i].Flags.getNonZeroByValAlign(),
6087 /*isVol = */ false, /*AlwaysInline = */ false,
6088 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
6089
6090 MemOpChains.push_back(Cpy);
6091 } else {
6092 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
6093 // promoted to a legal register type i32, we should truncate Arg back to
6094 // i1/i8/i16.
6095 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
6096 VA.getValVT() == MVT::i16)
6097 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
6098
6099 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
6100 MemOpChains.push_back(Store);
6101 }
6102 }
6103 }
6104
6105 if (!MemOpChains.empty())
6106 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
6107
6108 // Build a sequence of copy-to-reg nodes chained together with token chain
6109 // and flag operands which copy the outgoing args into the appropriate regs.
6110 SDValue InFlag;
6111 for (auto &RegToPass : RegsToPass) {
6112 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
6113 RegToPass.second, InFlag);
6114 InFlag = Chain.getValue(1);
6115 }
6116
6117 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
6118 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
6119 // node so that legalize doesn't hack it.
6120 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
6121 auto GV = G->getGlobal();
6122 unsigned OpFlags =
6123 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
6124 if (OpFlags & AArch64II::MO_GOT) {
6125 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
6126 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6127 } else {
6128 const GlobalValue *GV = G->getGlobal();
6129 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
6130 }
6131 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
6132 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6133 Subtarget->isTargetMachO()) {
6134 const char *Sym = S->getSymbol();
6135 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
6136 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6137 } else {
6138 const char *Sym = S->getSymbol();
6139 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
6140 }
6141 }
6142
6143 // We don't usually want to end the call-sequence here because we would tidy
6144 // the frame up *after* the call, however in the ABI-changing tail-call case
6145 // we've carefully laid out the parameters so that when sp is reset they'll be
6146 // in the correct location.
6147 if (IsTailCall && !IsSibCall) {
6148 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
6149 DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
6150 InFlag = Chain.getValue(1);
6151 }
6152
6153 std::vector<SDValue> Ops;
6154 Ops.push_back(Chain);
6155 Ops.push_back(Callee);
6156
6157 if (IsTailCall) {
6158 // Each tail call may have to adjust the stack by a different amount, so
6159 // this information must travel along with the operation for eventual
6160 // consumption by emitEpilogue.
6161 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
6162 }
6163
6164 // Add argument registers to the end of the list so that they are known live
6165 // into the call.
6166 for (auto &RegToPass : RegsToPass)
6167 Ops.push_back(DAG.getRegister(RegToPass.first,
6168 RegToPass.second.getValueType()));
6169
6170 // Add a register mask operand representing the call-preserved registers.
6171 const uint32_t *Mask;
6172 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6173 if (IsThisReturn) {
6174 // For 'this' returns, use the X0-preserving mask if applicable
6175 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
6176 if (!Mask) {
6177 IsThisReturn = false;
6178 Mask = TRI->getCallPreservedMask(MF, CallConv);
6179 }
6180 } else
6181 Mask = TRI->getCallPreservedMask(MF, CallConv);
6182
6183 if (Subtarget->hasCustomCallingConv())
6184 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
6185
6186 if (TRI->isAnyArgRegReserved(MF))
6187 TRI->emitReservedArgRegCallError(MF);
6188
6189 assert(Mask && "Missing call preserved mask for calling convention");
6190 Ops.push_back(DAG.getRegisterMask(Mask));
6191
6192 if (InFlag.getNode())
6193 Ops.push_back(InFlag);
6194
6195 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6196
6197 // If we're doing a tail call, use a TC_RETURN here rather than an
6198 // actual call instruction.
6199 if (IsTailCall) {
6200 MF.getFrameInfo().setHasTailCall();
6201 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
6202 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
6203 return Ret;
6204 }
6205
6206 unsigned CallOpc = AArch64ISD::CALL;
6207 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
6208 // be expanded to the call, directly followed by a special marker sequence.
6209 // Use the CALL_RVMARKER to do that.
6210 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
6211 assert(!IsTailCall &&
6212 "tail calls cannot be marked with clang.arc.attachedcall");
6213 CallOpc = AArch64ISD::CALL_RVMARKER;
6214 }
6215
6216 // Returns a chain and a flag for retval copy to use.
6217 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
6218 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
6219 InFlag = Chain.getValue(1);
6220 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
6221
6222 uint64_t CalleePopBytes =
6223 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
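// Only callee-pop conventions (see DoesCalleeRestoreStack, e.g. fastcc with
// GuaranteedTailCallOpt) report a non-zero pop amount here, rounded up to the
// 16-byte stack alignment; otherwise the caller tears down the argument area.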
6224
6225 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
6226 DAG.getIntPtrConstant(CalleePopBytes, DL, true),
6227 InFlag, DL);
6228 if (!Ins.empty())
6229 InFlag = Chain.getValue(1);
6230
6231 // Handle result values, copying them out of physregs into vregs that we
6232 // return.
6233 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
6234 InVals, IsThisReturn,
6235 IsThisReturn ? OutVals[0] : SDValue());
6236}
6237
6238bool AArch64TargetLowering::CanLowerReturn(
6239 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
6240 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
6241 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
6242 SmallVector<CCValAssign, 16> RVLocs;
6243 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
6244 return CCInfo.CheckReturn(Outs, RetCC);
6245}
6246
6247SDValue
6248AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
6249 bool isVarArg,
6250 const SmallVectorImpl<ISD::OutputArg> &Outs,
6251 const SmallVectorImpl<SDValue> &OutVals,
6252 const SDLoc &DL, SelectionDAG &DAG) const {
6253 auto &MF = DAG.getMachineFunction();
6254 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6255
6256 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
6257 SmallVector<CCValAssign, 16> RVLocs;
6258 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
6259 *DAG.getContext());
6260 CCInfo.AnalyzeReturn(Outs, RetCC);
6261
6262 // Copy the result values into the output registers.
6263 SDValue Flag;
6264 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
6265 SmallSet<unsigned, 4> RegsUsed;
6266 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
6267 ++i, ++realRVLocIdx) {
6268 CCValAssign &VA = RVLocs[i];
6269 assert(VA.isRegLoc() && "Can only return in registers!");
6270 SDValue Arg = OutVals[realRVLocIdx];
6271
6272 switch (VA.getLocInfo()) {
6273 default:
6274 llvm_unreachable("Unknown loc info!");
6275 case CCValAssign::Full:
6276 if (Outs[i].ArgVT == MVT::i1) {
6277 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
6278 // value. This is strictly redundant on Darwin (which uses "zeroext
6279 // i1"), but will be optimised out before ISel.
6280 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
6281 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
6282 }
6283 break;
6284 case CCValAssign::BCvt:
6285 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
6286 break;
6287 case CCValAssign::AExt:
6288 case CCValAssign::ZExt:
6289 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
6290 break;
6291 case CCValAssign::AExtUpper:
6292 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
6293 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
6294 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
6295 DAG.getConstant(32, DL, VA.getLocVT()));
6296 break;
6297 }
6298
6299 if (RegsUsed.count(VA.getLocReg())) {
6300 SDValue &Bits =
6301 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
6302 return Elt.first == VA.getLocReg();
6303 })->second;
6304 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
6305 } else {
6306 RetVals.emplace_back(VA.getLocReg(), Arg);
6307 RegsUsed.insert(VA.getLocReg());
6308 }
6309 }
6310
6311 SmallVector<SDValue, 4> RetOps(1, Chain);
6312 for (auto &RetVal : RetVals) {
6313 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
6314 Flag = Chain.getValue(1);
6315 RetOps.push_back(
6316 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
6317 }
6318
6319 // Windows AArch64 ABIs require that for returning structs by value we copy
6320 // the sret argument into X0 for the return.
6321 // We saved the argument into a virtual register in the entry block,
6322 // so now we copy the value out and into X0.
6323 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
6324 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
6325 getPointerTy(MF.getDataLayout()));
6326
6327 unsigned RetValReg = AArch64::X0;
6328 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
6329 Flag = Chain.getValue(1);
6330
6331 RetOps.push_back(
6332 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
6333 }
6334
6335 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6336 const MCPhysReg *I =
6337 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
6338 if (I) {
6339 for (; *I; ++I) {
6340 if (AArch64::GPR64RegClass.contains(*I))
6341 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
6342 else if (AArch64::FPR64RegClass.contains(*I))
6343 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
6344 else
6345 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
6346 }
6347 }
6348
6349 RetOps[0] = Chain; // Update chain.
6350
6351 // Add the flag if we have it.
6352 if (Flag.getNode())
6353 RetOps.push_back(Flag);
6354
6355 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
6356}
6357
6358//===----------------------------------------------------------------------===//
6359// Other Lowering Code
6360//===----------------------------------------------------------------------===//
6361
6362SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
6363 SelectionDAG &DAG,
6364 unsigned Flag) const {
6365 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
6366 N->getOffset(), Flag);
6367}
6368
6369SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
6370 SelectionDAG &DAG,
6371 unsigned Flag) const {
6372 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
6373}
6374
6375SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
6376 SelectionDAG &DAG,
6377 unsigned Flag) const {
6378 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
6379 N->getOffset(), Flag);
6380}
6381
6382SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
6383 SelectionDAG &DAG,
6384 unsigned Flag) const {
6385 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
6386}
6387
6388// (loadGOT sym)
6389template <class NodeTy>
6390SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
6391 unsigned Flags) const {
6392 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
6393 SDLoc DL(N);
6394 EVT Ty = getPointerTy(DAG.getDataLayout());
6395 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
6396 // FIXME: Once remat is capable of dealing with instructions with register
6397 // operands, expand this into two nodes instead of using a wrapper node.
6398 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
6399}
6400
6401// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
6402template <class NodeTy>
6403SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
6404 unsigned Flags) const {
6405 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
6406 SDLoc DL(N);
6407 EVT Ty = getPointerTy(DAG.getDataLayout());
6408 const unsigned char MO_NC = AArch64II::MO_NC;
6409 return DAG.getNode(
6410 AArch64ISD::WrapperLarge, DL, Ty,
6411 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
6412 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
6413 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
6414 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
6415}
6416
6417// (addlow (adrp %hi(sym)) %lo(sym))
6418template <class NodeTy>
6419SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
6420 unsigned Flags) const {
6421 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
6422 SDLoc DL(N);
6423 EVT Ty = getPointerTy(DAG.getDataLayout());
6424 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
6425 SDValue Lo = getTargetNode(N, Ty, DAG,
6426 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
6427 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
6428 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
6429}
6430
6431// (adr sym)
6432template <class NodeTy>
6433SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
6434 unsigned Flags) const {
6435 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
6436 SDLoc DL(N);
6437 EVT Ty = getPointerTy(DAG.getDataLayout());
6438 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
6439 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
6440}
6441
6442SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
6443 SelectionDAG &DAG) const {
6444 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
6445 const GlobalValue *GV = GN->getGlobal();
6446 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
6447
6450 "unexpected offset in global node");
6451
6452 // This also catches the large code model case for Darwin, and tiny code
6453 // model with got relocations.
6454 if ((OpFlags & AArch64II::MO_GOT) != 0) {
6455 return getGOT(GN, DAG, OpFlags);
6456 }
6457
6460 Result = getAddrLarge(GN, DAG, OpFlags);
6461 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6462 Result = getAddrTiny(GN, DAG, OpFlags);
6463 } else {
6464 Result = getAddr(GN, DAG, OpFlags);
6465 }
6467 SDLoc DL(GN);
6469 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
6471 return Result;
6472}
6473
6474/// Convert a TLS address reference into the correct sequence of loads
6475/// and calls to compute the variable's address (for Darwin, currently) and
6476/// return an SDValue containing the final node.
6477
6478/// Darwin only has one TLS scheme which must be capable of dealing with the
6479/// fully general situation, in the worst case. This means:
6480/// + "extern __thread" declaration.
6481/// + Defined in a possibly unknown dynamic library.
6482///
6483/// The general system is that each __thread variable has a [3 x i64] descriptor
6484/// which contains information used by the runtime to calculate the address. The
6485/// only part of this the compiler needs to know about is the first xword, which
6486/// contains a function pointer that must be called with the address of the
6487/// entire descriptor in "x0".
6488///
6489/// Since this descriptor may be in a different unit, in general even the
6490/// descriptor must be accessed via an indirect load. The "ideal" code sequence
6491/// is:
6492/// adrp x0, _var@TLVPPAGE
6493/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
6494/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
6495/// ; the function pointer
6496/// blr x1 ; Uses descriptor address in x0
6497/// ; Address of _var is now in x0.
6498///
6499/// If the address of _var's descriptor *is* known to the linker, then it can
6500/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
6501/// a slight efficiency gain.
6502SDValue
6503AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
6504 SelectionDAG &DAG) const {
6505 assert(Subtarget->isTargetDarwin() &&
6506 "This function expects a Darwin target");
6507
6508 SDLoc DL(Op);
6511 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
6512
6516
6517 // The first entry in the descriptor is a function pointer that we must call
6518 // to obtain the address of the variable.
6519 SDValue Chain = DAG.getEntryNode();
6521 PtrMemVT, DL, Chain, DescAddr,
6523 Align(PtrMemVT.getSizeInBits() / 8),
6525 Chain = FuncTLVGet.getValue(1);
6526
6527 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
6529
6531 MFI.setAdjustsStack(true);
6532
6533 // TLS calls preserve all registers except those that absolutely must be
6534 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
6535 // silly).
6536 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6537 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
6538 if (Subtarget->hasCustomCallingConv())
6539 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
6540
6541 // Finally, we can make the call. This is just a degenerate version of a
6542 // normal AArch64 call node: x0 takes the address of the descriptor, and
6543 // returns the address of the variable in this thread.
6544 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
6545 Chain =
6547 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
6548 DAG.getRegisterMask(Mask), Chain.getValue(1));
6549 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
6550}
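
// A standalone sketch (plain C++, names and layout are illustrative rather
// than the Darwin runtime's definitions) of the descriptor resolution the
// call sequence above performs: load the first xword of the descriptor and
// call it with the descriptor's own address, exactly as the adrp/ldr/blr
// comment describes.
struct TLVDescriptorSketch {
  void *(*Thunk)(TLVDescriptorSketch *); // first xword: resolver called via blr
  unsigned long Key;                     // remaining words: runtime-private data
  unsigned long Offset;
};
static void *resolveTLVSketch(TLVDescriptorSketch *Desc) {
  return Desc->Thunk(Desc); // x0 carries Desc; the result is the variable address
}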
6551
6552/// Convert a thread-local variable reference into a sequence of instructions to
6553/// compute the variable's address for the local exec TLS model of ELF targets.
6554/// The sequence depends on the maximum TLS area size.
6555SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
6557 const SDLoc &DL,
6558 SelectionDAG &DAG) const {
6561
6562 switch (DAG.getTarget().Options.TLSSize) {
6563 default:
6564 llvm_unreachable("Unexpected TLS size");
6565
6566 case 12: {
6567 // mrs x0, TPIDR_EL0
6568 // add x0, x0, :tprel_lo12:a
6571 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6572 Var,
6573 DAG.getTargetConstant(0, DL, MVT::i32)),
6574 0);
6575 }
6576
6577 case 24: {
6578 // mrs x0, TPIDR_EL0
6579 // add x0, x0, :tprel_hi12:a
6580 // add x0, x0, :tprel_lo12_nc:a
6584 GV, DL, PtrVT, 0,
6586 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6587 HiVar,
6588 DAG.getTargetConstant(0, DL, MVT::i32)),
6589 0);
6590 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
6591 LoVar,
6592 DAG.getTargetConstant(0, DL, MVT::i32)),
6593 0);
6594 }
6595
6596 case 32: {
6597 // mrs x1, TPIDR_EL0
6598 // movz x0, #:tprel_g1:a
6599 // movk x0, #:tprel_g0_nc:a
6600 // add x0, x1, x0
6604 GV, DL, PtrVT, 0,
6606 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6607 DAG.getTargetConstant(16, DL, MVT::i32)),
6608 0);
6609 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6610 DAG.getTargetConstant(0, DL, MVT::i32)),
6611 0);
6612 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6613 }
6614
6615 case 48: {
6616 // mrs x1, TPIDR_EL0
6617 // movz x0, #:tprel_g2:a
6618 // movk x0, #:tprel_g1_nc:a
6619 // movk x0, #:tprel_g0_nc:a
6620 // add x0, x1, x0
6624 GV, DL, PtrVT, 0,
6627 GV, DL, PtrVT, 0,
6629 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6630 DAG.getTargetConstant(32, DL, MVT::i32)),
6631 0);
6632 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
6633 DAG.getTargetConstant(16, DL, MVT::i32)),
6634 0);
6635 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6636 DAG.getTargetConstant(0, DL, MVT::i32)),
6637 0);
6638 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6639 }
6640 }
6641}
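
// A standalone sketch (plain C++, helper name is illustrative) of the
// arithmetic split that the :tprel_g2:/:tprel_g1_nc:/:tprel_g0_nc: and
// :tprel_hi12:/:tprel_lo12_nc: relocations used above perform on a
// TP-relative offset.
#include <cstdint>
struct TPRelPiecesSketch {
  uint64_t G2, G1, G0;  // 16-bit chunks consumed by movz/movk (32/48-bit TLS)
  uint64_t Hi12, Lo12;  // 12-bit chunks consumed by add (12/24-bit TLS)
};
static TPRelPiecesSketch splitTPRelOffsetSketch(uint64_t Off) {
  return {(Off >> 32) & 0xffff, (Off >> 16) & 0xffff, Off & 0xffff,
          (Off >> 12) & 0xfff, Off & 0xfff};
}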
6642
6643/// When accessing thread-local variables under either the general-dynamic or
6644/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
6645/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
6646/// is a function pointer to carry out the resolution.
6647///
6648/// The sequence is:
6649/// adrp x0, :tlsdesc:var
6650/// ldr x1, [x0, #:tlsdesc_lo12:var]
6651/// add x0, x0, #:tlsdesc_lo12:var
6652/// .tlsdesccall var
6653/// blr x1
6654/// (TPIDR_EL0 offset now in x0)
6655///
6656/// The above sequence must be produced unscheduled, to enable the linker to
6657/// optimize/relax this sequence.
6658/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
6659/// above sequence, and expanded really late in the compilation flow, to ensure
6660/// the sequence is produced as per above.
6661SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
6662 const SDLoc &DL,
6663 SelectionDAG &DAG) const {
6665
6666 SDValue Chain = DAG.getEntryNode();
6668
6669 Chain =
6671 SDValue Glue = Chain.getValue(1);
6672
6673 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
6674}
6675
6676SDValue
6677AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
6678 SelectionDAG &DAG) const {
6679 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
6680
6682
6683 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
6684
6686 if (Model == TLSModel::LocalDynamic)
6688 }
6689
6691 Model != TLSModel::LocalExec)
6692 report_fatal_error("ELF TLS only supported in small memory model or "
6693 "in local exec TLS model");
6694 // Different choices can be made for the maximum size of the TLS area for a
6695 // module. For the small address model, the default TLS size is 16MiB and the
6696 // maximum TLS size is 4GiB.
6697 // FIXME: add tiny and large code model support for TLS access models other
6698 // than local exec. We currently generate the same code as small for tiny,
6699 // which may be larger than needed.
6700
6701 SDValue TPOff;
6703 SDLoc DL(Op);
6704 const GlobalValue *GV = GA->getGlobal();
6705
6707
6708 if (Model == TLSModel::LocalExec) {
6709 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
6710 } else if (Model == TLSModel::InitialExec) {
6713 } else if (Model == TLSModel::LocalDynamic) {
6714 // Local-dynamic accesses proceed in two phases: a general-dynamic TLS
6715 // descriptor call against the special symbol _TLS_MODULE_BASE_ computes
6716 // the beginning of the module's TLS region, and a DTPREL offset calculation
6717 // then reaches the variable within that region.
6718
6719 // These accesses will need deduplicating if there's more than one.
6720 AArch64FunctionInfo *MFI =
6723
6724 // The call needs a relocation too for linker relaxation. It doesn't make
6725 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6726 // the address.
6727 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
6729
6730 // Now we can calculate the offset from TPIDR_EL0 to this module's
6731 // thread-local area.
6732 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6733
6734 // Now use :dtprel_whatever: operations to calculate this variable's offset
6735 // in its thread-storage area.
6739 GV, DL, MVT::i64, 0,
6741
6742 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
6743 DAG.getTargetConstant(0, DL, MVT::i32)),
6744 0);
6745 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
6746 DAG.getTargetConstant(0, DL, MVT::i32)),
6747 0);
6748 } else if (Model == TLSModel::GeneralDynamic) {
6749 // The call needs a relocation too for linker relaxation. It doesn't make
6750 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6751 // the address.
6754
6755 // Finally we can make a call to calculate the offset from tpidr_el0.
6756 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6757 } else
6758 llvm_unreachable("Unsupported ELF TLS access model");
6759
6760 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6761}
6762
6763SDValue
6764AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
6765 SelectionDAG &DAG) const {
6766 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
6767
6768 SDValue Chain = DAG.getEntryNode();
6770 SDLoc DL(Op);
6771
6772 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
6773
6774 // Load the ThreadLocalStoragePointer from the TEB
6775 // A pointer to the TLS array is located at offset 0x58 from the TEB.
6777 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
6779 Chain = TLSArray.getValue(1);
6780
6781 // Load the TLS index from the C runtime;
6782 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
6783 // This also does the same as LOADgot, but using a generic i32 load,
6784 // while LOADgot only loads i64.
6793 Chain = TLSIndex.getValue(1);
6794
6795 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
6796 // offset into the TLSArray.
6799 DAG.getConstant(3, DL, PtrVT));
6800 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
6801 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
6803 Chain = TLS.getValue(1);
6804
6806 const GlobalValue *GV = GA->getGlobal();
6810 GV, DL, PtrVT, 0,
6812
6813 // Add the offset from the start of the .tls section (section base).
6814 SDValue Addr =
6815 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
6816 DAG.getTargetConstant(0, DL, MVT::i32)),
6817 0);
6819 return Addr;
6820}
6821
6822SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
6823 SelectionDAG &DAG) const {
6825 if (DAG.getTarget().useEmulatedTLS())
6826 return LowerToTLSEmulatedModel(GA, DAG);
6827
6828 if (Subtarget->isTargetDarwin())
6829 return LowerDarwinGlobalTLSAddress(Op, DAG);
6830 if (Subtarget->isTargetELF())
6831 return LowerELFGlobalTLSAddress(Op, DAG);
6832 if (Subtarget->isTargetWindows())
6833 return LowerWindowsGlobalTLSAddress(Op, DAG);
6834
6835 llvm_unreachable("Unexpected platform trying to use TLS");
6836}
6837
6838// Looks through \param Val to determine the bit that can be used to
6839// check the sign of the value. It returns the unextended value and
6840// the sign bit position.
6841std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
6842 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
6843 return {Val.getOperand(0),
6844 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
6845 1};
6846
6847 if (Val.getOpcode() == ISD::SIGN_EXTEND)
6848 return {Val.getOperand(0),
6849 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
6850
6851 return {Val, Val.getValueSizeInBits() - 1};
6852}
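
// A standalone sketch (plain C++, function name is illustrative) of the
// identity the TBZ/TBNZ folds below rely on: for an N-bit signed value,
// "x < 0" is exactly "bit N-1 is set" and "x > -1" is "bit N-1 is clear", so
// a conditional branch on those comparisons can become a single
// test-bit-and-branch on the position returned above.
#include <cstdint>
static bool isNegativeViaSignBitSketch(int32_t X) {
  return ((static_cast<uint32_t>(X) >> 31) & 1u) != 0; // same result as X < 0
}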
6853
6854SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
6855 SDValue Chain = Op.getOperand(0);
6856 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
6857 SDValue LHS = Op.getOperand(2);
6858 SDValue RHS = Op.getOperand(3);
6859 SDValue Dest = Op.getOperand(4);
6860 SDLoc dl(Op);
6861
6863 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
6864 // will not be produced, as they are conditional branch instructions that do
6865 // not set flags.
6866 bool ProduceNonFlagSettingCondBr =
6867 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
6868
6869 // Handle f128 first, since lowering it will result in comparing the return
6870 // value of a libcall against zero, which is just what the rest of LowerBR_CC
6871 // is expecting to deal with.
6872 if (LHS.getValueType() == MVT::f128) {
6873 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
6874
6875 // If softenSetCCOperands returned a scalar, we need to compare the result
6876 // against zero to select between true and false values.
6877 if (!RHS.getNode()) {
6878 RHS = DAG.getConstant(0, dl, LHS.getValueType());
6879 CC = ISD::SETNE;
6880 }
6881 }
6882
6883 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
6884 // instruction.
6885 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
6886 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6887 // Only lower legal XALUO ops.
6888 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
6889 return SDValue();
6890
6891 // The actual operation with overflow check.
6894 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
6895
6896 if (CC == ISD::SETNE)
6897 OFCC = getInvertedCondCode(OFCC);
6898 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
6899
6900 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6901 Overflow);
6902 }
6903
6904 if (LHS.getValueType().isInteger()) {
6905 assert((LHS.getValueType() == RHS.getValueType()) &&
6906 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
6907
6908 // If the RHS of the comparison is zero, we can potentially fold this
6909 // to a specialized branch.
6911 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
6912 if (CC == ISD::SETEQ) {
6913 // See if we can use a TBZ to fold in an AND as well.
6914 // TBZ has a smaller branch displacement than CBZ. If the offset is
6915 // out of bounds, a late MI-layer pass rewrites branches.
6916 // 403.gcc is an example that hits this case.
6917 if (LHS.getOpcode() == ISD::AND &&
6918 isa<ConstantSDNode>(LHS.getOperand(1)) &&
6919 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6920 SDValue Test = LHS.getOperand(0);
6921 uint64_t Mask = LHS.getConstantOperandVal(1);
6922 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
6923 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6924 Dest);
6925 }
6926
6927 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
6928 } else if (CC == ISD::SETNE) {
6929 // See if we can use a TBZ to fold in an AND as well.
6930 // TBZ has a smaller branch displacement than CBZ. If the offset is
6931 // out of bounds, a late MI-layer pass rewrites branches.
6932 // 403.gcc is an example that hits this case.
6933 if (LHS.getOpcode() == ISD::AND &&
6934 isa<ConstantSDNode>(LHS.getOperand(1)) &&
6935 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6936 SDValue Test = LHS.getOperand(0);
6937 uint64_t Mask = LHS.getConstantOperandVal(1);
6938 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
6939 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6940 Dest);
6941 }
6942
6943 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
6944 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
6945 // Don't combine AND since emitComparison converts the AND to an ANDS
6946 // (a.k.a. TST) and the test in the test bit and branch instruction
6947 // becomes redundant. This would also increase register pressure.
6948 uint64_t SignBitPos;
6949 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6950 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
6951 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6952 }
6953 }
6954 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
6955 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
6956 // Don't combine AND since emitComparison converts the AND to an ANDS
6957 // (a.k.a. TST) and the test in the test bit and branch instruction
6958 // becomes redundant. This would also increase register pressure.
6959 uint64_t SignBitPos;
6960 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6961 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
6962 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6963 }
6964
6965 SDValue CCVal;
6966 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
6967 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6968 Cmp);
6969 }
6970
6971 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
6972 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
6973
6974 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
6975 // clean. Some of them require two branches to implement.
6976 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
6980 SDValue BR1 =
6981 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
6982 if (CC2 != AArch64CC::AL) {
6984 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
6985 Cmp);
6986 }
6987
6988 return BR1;
6989}
6990
6991SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
6992 SelectionDAG &DAG) const {
6993 EVT VT = Op.getValueType();
6994 SDLoc DL(Op);
6995
6996 SDValue In1 = Op.getOperand(0);
6997 SDValue In2 = Op.getOperand(1);
6998 EVT SrcVT = In2.getValueType();
6999
7000 if (SrcVT.bitsLT(VT))
7001 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
7002 else if (SrcVT.bitsGT(VT))
7003 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
7004
7005 EVT VecVT;
7006 uint64_t EltMask;
7008
7009 auto setVecVal = [&] (int Idx) {
7010 if (!VT.isVector()) {
7011 VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
7012 DAG.getUNDEF(VecVT), In1);
7013 VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
7014 DAG.getUNDEF(VecVT), In2);
7015 } else {
7016 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
7017 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
7018 }
7019 };
7020
7021 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
7022 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
7023 EltMask = 0x80000000ULL;
7024 setVecVal(AArch64::ssub);
7025 } else if (VT == MVT::f64 || VT == MVT::v2f64) {
7026 VecVT = MVT::v2i64;
7027
7028 // We want to materialize a mask with the high bit set, but the AdvSIMD
7029 // immediate moves cannot materialize that in a single instruction for
7030 // 64-bit elements. Instead, materialize zero and then negate it.
7031 EltMask = 0;
7032
7033 setVecVal(AArch64::dsub);
7034 } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
7035 VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
7036 EltMask = 0x8000ULL;
7037 setVecVal(AArch64::hsub);
7038 } else {
7039 llvm_unreachable("Invalid type for copysign!");
7040 }
7041
7042 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
7043
7044 // If we couldn't materialize the mask above, then the mask vector will be
7045 // the zero vector, and we need to negate it here.
7046 if (VT == MVT::f64 || VT == MVT::v2f64) {
7050 }
7051
7052 SDValue Sel =
7054
7055 if (VT == MVT::f16)
7056 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
7057 if (VT == MVT::f32)
7058 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
7059 else if (VT == MVT::f64)
7060 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
7061 else
7062 return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
7063}
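
// A standalone sketch (plain C++, function name is illustrative) of the
// bitwise copysign that the vector bit-select above implements for f32: keep
// the magnitude bits of the first operand and take only the 0x80000000 sign
// bit of the second, which is exactly the EltMask built for the f32 cases.
#include <cstdint>
#include <cstring>
static float copySignBitwiseSketch(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint32_t R = (M & 0x7fffffffu) | (S & 0x80000000u); // select only the sign bit
  float Out;
  std::memcpy(&Out, &R, sizeof(Out));
  return Out;
}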
7064
7065SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
7067 Attribute::NoImplicitFloat))
7068 return SDValue();
7069
7070 if (!Subtarget->hasNEON())
7071 return SDValue();
7072
7073 // While there is no integer popcount instruction, CTPOP can
7074 // be lowered more efficiently to the following sequence that uses
7075 // AdvSIMD registers/instructions, as long as the copies to/from
7076 // the AdvSIMD registers are cheap.
7077 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
7078 // CNT V0.8B, V0.8B // 8xbyte pop-counts
7079 // ADDV B0, V0.8B // sum 8xbyte pop-counts
7080 // UMOV X0, V0.B[0] // copy byte result back to integer reg
7081 SDValue Val = Op.getOperand(0);
7082 SDLoc DL(Op);
7083 EVT VT = Op.getValueType();
7084
7085 if (VT == MVT::i32 || VT == MVT::i64) {
7086 if (VT == MVT::i32)
7087 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
7088 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
7089
7091 SDValue UaddLV = DAG.getNode(
7093 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
7094
7095 if (VT == MVT::i64)
7097 return UaddLV;
7098 } else if (VT == MVT::i128) {
7099 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
7100
7102 SDValue UaddLV = DAG.getNode(
7104 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
7105
7107 }
7108
7109 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
7110 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
7111
7112 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
7113 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
7114 "Unexpected type for custom ctpop lowering");
7115
7117 Val = DAG.getBitcast(VT8Bit, Val);
7118 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
7119
7120 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
7121 unsigned EltSize = 8;
7122 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
7123 while (EltSize != VT.getScalarSizeInBits()) {
7124 EltSize *= 2;
7125 NumElts /= 2;
7127 Val = DAG.getNode(
7129 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
7130 }
7131
7132 return Val;
7133}
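
// A standalone sketch (plain C++, function name is illustrative) of the
// scalar arithmetic behind the lowering above: CNT produces a popcount per
// byte, and uaddlv/uaddlp merely sum those per-byte counts back up to the
// requested element width.
#include <cstdint>
static uint32_t popcountViaBytesSketch(uint32_t X) {
  uint32_t Sum = 0;
  for (int I = 0; I < 4; ++I) {
    uint8_t Byte = (X >> (8 * I)) & 0xff;
    for (; Byte; Byte >>= 1) // per-byte popcount, as CNT does per lane
      Sum += Byte & 1u;
  }
  return Sum; // the widening pairwise/across-lane adds perform this summation
}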
7134
7135SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
7136 EVT VT = Op.getValueType();
7137 assert(VT.isScalableVector() ||
7138 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true));
7139
7140 SDLoc DL(Op);
7141 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
7142 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
7143}
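
// A standalone sketch (plain C++, function name is illustrative) of the
// identity used above: cttz(x) == ctlz(bitreverse(x)), so a bit reversal
// (RBIT) followed by a count-leading-zeros lowering yields count-trailing-zeros.
#include <cstdint>
static unsigned cttzViaCtlzSketch(uint32_t X) {
  uint32_t Rev = 0;
  for (int I = 0; I < 32; ++I)
    Rev |= ((X >> I) & 1u) << (31 - I); // bit reversal (RBIT)
  unsigned LeadingZeros = 0;
  for (int I = 31; I >= 0 && ((Rev >> I) & 1u) == 0; --I)
    ++LeadingZeros;                     // count leading zeros (CLZ)
  return LeadingZeros;                  // equals the trailing zero count of X
}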
7144
7145SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
7146 SelectionDAG &DAG) const {
7147 EVT VT = Op.getValueType();
7148
7149 if (VT.isScalableVector() ||
7150 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7151 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
7152 true);
7153
7154 SDLoc DL(Op);
7155 SDValue REVB;
7156 MVT VST;
7157
7158 switch (VT.getSimpleVT().SimpleTy) {
7159 default:
7160 llvm_unreachable("Invalid type for bitreverse!");
7161
7162 case MVT::v2i32: {
7163 VST = MVT::v8i8;
7164 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
7165
7166 break;
7167 }
7168
7169 case MVT::v4i32: {
7170 VST = MVT::v16i8;
7171 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
7172
7173 break;
7174 }
7175
7176 case MVT::v1i64: {
7177 VST = MVT::v8i8;
7178 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
7179
7180 break;
7181 }
7182
7183 case MVT::v2i64: {
7184 VST = MVT::v16i8;
7185 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
7186
7187 break;
7188 }
7189 }
7190
7191 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
7193}
7194
7195SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
7196
7197 if (Op.getValueType().isVector())
7198 return LowerVSETCC(Op, DAG);
7199
7200 bool IsStrict = Op->isStrictFPOpcode();
7201 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
7202 unsigned OpNo = IsStrict ? 1 : 0;
7203 SDValue Chain;
7204 if (IsStrict)
7205 Chain = Op.getOperand(0);
7206 SDValue LHS = Op.getOperand(OpNo + 0);
7207 SDValue RHS = Op.getOperand(OpNo + 1);
7208 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
7209 SDLoc dl(Op);
7210
7211 // We chose ZeroOrOneBooleanContents, so use zero and one.
7212 EVT VT = Op.getValueType();
7213 SDValue TVal = DAG.getConstant(1, dl, VT);
7214 SDValue FVal = DAG.getConstant(0, dl, VT);
7215
7216 // Handle f128 first, since one possible outcome is a normal integer
7217 // comparison which gets picked up by the next if statement.
7218 if (LHS.getValueType() == MVT::f128) {
7219 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
7220 IsSignaling);
7221
7222 // If softenSetCCOperands returned a scalar, use it.
7223 if (!RHS.getNode()) {
7224 assert(LHS.getValueType() == Op.getValueType() &&
7225 "Unexpected setcc expansion!");
7226 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
7227 }
7228 }
7229
7230 if (LHS.getValueType().isInteger()) {
7231 SDValue CCVal;
7233 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
7234
7235 // Note that we inverted the condition above, so we reverse the order of
7236 // the true and false operands here. This will allow the setcc to be
7237 // matched to a single CSINC instruction.
7238 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
7239 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
7240 }
7241
7242 // Now we know we're dealing with FP values.
7243 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
7244 LHS.getValueType() == MVT::f64);
7245
7246 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
7247 // and do the comparison.
7248 SDValue Cmp;
7249 if (IsStrict)
7250 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
7251 else
7252 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
7253
7256 SDValue Res;
7257 if (CC2 == AArch64CC::AL) {
7258 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
7259 CC2);
7261
7262 // Note that we inverted the condition above, so we reverse the order of
7263 // the true and false operands here. This will allow the setcc to be
7264 // matched to a single CSINC instruction.
7265 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
7266 } else {
7267 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
7268 // totally clean. Some of them require two CSELs to implement. As is in
7269 // this case, we emit the first CSEL and then emit a second using the output
7270 // of the first as the RHS. We're effectively OR'ing the two CC's together.
7271
7272 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
7274 SDValue CS1 =
7275 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
7276
7278 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
7279 }
7280 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
7281}
7282
7283SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
7284 SDValue RHS, SDValue TVal,
7285 SDValue FVal, const SDLoc &dl,
7286 SelectionDAG &DAG) const {
7287 // Handle f128 first, because it will result in a comparison of some RTLIB
7288 // call result against zero.
7289 if (LHS.getValueType() == MVT::f128) {
7290 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
7291
7292 // If softenSetCCOperands returned a scalar, we need to compare the result
7293 // against zero to select between true and false values.
7294 if (!RHS.getNode()) {
7295 RHS = DAG.getConstant(0, dl, LHS.getValueType());
7296 CC = ISD::SETNE;
7297 }
7298 }
7299
7300 // Also handle f16, for which we need to do an f32 comparison.
7301 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
7302 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
7303 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
7304 }
7305
7306 // Next, handle integers.
7307 if (LHS.getValueType().isInteger()) {
7308 assert((LHS.getValueType() == RHS.getValueType()) &&
7309 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
7310
7314 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
7315 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
7316 // supported types.
7317 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnesValue() && CTVal && CFVal &&
7318 CTVal->isOne() && CFVal->isAllOnesValue() &&
7319 LHS.getValueType() == TVal.getValueType()) {
7320 EVT VT = LHS.getValueType();
7321 SDValue Shift =
7322 DAG.getNode(ISD::SRA, dl, VT, LHS,
7323 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
7324 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
7325 }
7326
7327 unsigned Opcode = AArch64ISD::CSEL;
7328
7329 // If both the TVal and the FVal are constants, see if we can swap them in
7330 // order to form a CSINV or CSINC out of them.
7331 if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
7334 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7335 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
7338 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7339 } else if (TVal.getOpcode() == ISD::XOR) {
7340 // If TVal is a NOT we want to swap TVal and FVal so that we can match
7341 // with a CSINV rather than a CSEL.
7342 if (isAllOnesConstant(TVal.getOperand(1))) {
7345 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7346 }
7347 } else if (TVal.getOpcode() == ISD::SUB) {
7348 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
7349 // that we can match with a CSNEG rather than a CSEL.
7350 if (isNullConstant(TVal.getOperand(0))) {
7353 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7354 }
7355 } else if (CTVal && CFVal) {
7356 const int64_t TrueVal = CTVal->getSExtValue();
7357 const int64_t FalseVal = CFVal->getSExtValue();
7358 bool Swap = false;
7359
7360 // If both TVal and FVal are constants, see if FVal is the
7361 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
7362 // instead of a CSEL in that case.
7363 if (TrueVal == ~FalseVal) {
7364 Opcode = AArch64ISD::CSINV;
7365 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
7366 TrueVal == -FalseVal) {
7367 Opcode = AArch64ISD::CSNEG;
7368 } else if (TVal.getValueType() == MVT::i32) {
7369 // If our operands are only 32-bit wide, make sure we use 32-bit
7370 // arithmetic for the check whether we can use CSINC. This ensures that
7371 // the addition in the check will wrap around properly in case there is
7372 // an overflow (which would not be the case if we do the check with
7373 // 64-bit arithmetic).
7374 const uint32_t TrueVal32 = CTVal->getZExtValue();
7375 const uint32_t FalseVal32 = CFVal->getZExtValue();
7376
7377 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
7378 Opcode = AArch64ISD::CSINC;
7379
7380 if (TrueVal32 > FalseVal32) {
7381 Swap = true;
7382 }
7383 }
7384 // 64-bit check whether we can use CSINC.
7385 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
7386 Opcode = AArch64ISD::CSINC;
7387
7388 if (TrueVal > FalseVal) {
7389 Swap = true;
7390 }
7391 }
7392
7393 // Swap TVal and FVal if necessary.
7394 if (Swap) {
7397 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7398 }
7399
7400 if (Opcode != AArch64ISD::CSEL) {
7401 // Drop FVal since we can get its value by simply inverting/negating
7402 // TVal.
7403 FVal = TVal;
7404 }
7405 }
7406
7407 // Avoid materializing a constant when possible by reusing a known value in
7408 // a register. However, don't perform this optimization if the known value
7409 // is one, zero or negative one in the case of a CSEL. We can always
7410 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
7411 // FVal, respectively.
7413 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
7414 !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
7416 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
7417 // "a != C ? x : a" to avoid materializing C.
7418 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
7419 TVal = LHS;
7420 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
7421 FVal = LHS;
7422 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
7423 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
7424 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
7425 // avoid materializing C.
7427 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
7428 Opcode = AArch64ISD::CSINV;
7429 TVal = LHS;
7430 FVal = DAG.getConstant(0, dl, FVal.getValueType());
7431 }
7432 }
7433
7434 SDValue CCVal;
7435 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
7436 EVT VT = TVal.getValueType();
7437 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
7438 }
7439
7440 // Now we know we're dealing with FP values.
7441 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
7442 LHS.getValueType() == MVT::f64);
7443 assert(LHS.getValueType() == RHS.getValueType());
7444 EVT VT = TVal.getValueType();
7445 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
7446
7447 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
7448 // clean. Some of them require two CSELs to implement.
7451
7452 if (DAG.getTarget().Options.UnsafeFPMath) {
7453 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
7454 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
7456 if (RHSVal && RHSVal->isZero()) {
7459
7460 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
7461 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
7462 TVal = LHS;
7463 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
7464 CFVal && CFVal->isZero() &&
7465 FVal.getValueType() == LHS.getValueType())
7466 FVal = LHS;
7467 }
7468 }
7469
7470 // Emit first, and possibly only, CSEL.
7472 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
7473
7474 // If we need a second CSEL, emit it, using the output of the first as the
7475 // RHS. We're effectively OR'ing the two CC's together.
7476 if (CC2 != AArch64CC::AL) {
7478 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
7479 }
7480
7481 // Otherwise, return the output of the first CSEL.
7482 return CS1;
7483}
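
// A standalone sketch (plain C++, names are illustrative) of the constant
// relationships the lowering above looks for when preferring
// CSINV/CSNEG/CSINC over a plain CSEL; the increment check is done in
// unsigned arithmetic so wrap-around is well defined, as the 32-bit special
// case above also ensures.
#include <cstdint>
enum class CondSelKindSketch { CSEL, CSINV, CSNEG, CSINC };
static CondSelKindSketch classifyCondSelectSketch(int64_t TrueVal,
                                                  int64_t FalseVal) {
  if (TrueVal == ~FalseVal)
    return CondSelKindSketch::CSINV; // select(c, a, ~a)       -> csinv
  if (FalseVal != INT64_MIN && TrueVal == -FalseVal)
    return CondSelKindSketch::CSNEG; // select(c, a, -a)       -> csneg
  if (static_cast<uint64_t>(TrueVal) == static_cast<uint64_t>(FalseVal) + 1 ||
      static_cast<uint64_t>(TrueVal) + 1 == static_cast<uint64_t>(FalseVal))
    return CondSelKindSketch::CSINC; // select(c, a, a+1/a-1)  -> csinc (maybe swapped)
  return CondSelKindSketch::CSEL;
}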
7484
7485SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
7486 SelectionDAG &DAG) const {
7487
7488 EVT Ty = Op.getValueType();
7489 auto Idx = Op.getConstantOperandAPInt(2);
7490 if (Idx.sge(-1) && Idx.slt(Ty.getVectorMinNumElements()))
7491 return Op;
7492 return SDValue();
7493}
7494
7495SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
7496 SelectionDAG &DAG) const {
7497 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
7498 SDValue LHS = Op.getOperand(0);
7499 SDValue RHS = Op.getOperand(1);
7500 SDValue TVal = Op.getOperand(2);
7501 SDValue FVal = Op.getOperand(3);
7502 SDLoc DL(Op);
7503 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
7504}
7505
7506SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
7507 SelectionDAG &DAG) const {
7508 SDValue CCVal = Op->getOperand(0);
7509 SDValue TVal = Op->getOperand(1);
7510 SDValue FVal = Op->getOperand(2);
7511 SDLoc DL(Op);
7512
7513 EVT Ty = Op.getValueType();
7514 if (Ty.isScalableVector()) {
7516 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
7518 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
7519 }
7520
7521 if (useSVEForFixedLengthVectorVT(Ty)) {
7522 // FIXME: Ideally this would be the same as above using i1 types, however
7523 // for the moment we can't deal with fixed i1 vector types properly, so
7524 // instead extend the predicate to a result type sized integer vector.
7525 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
7526 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
7529 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
7530 }
7531
7532 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
7533 // instruction.
7534 if (ISD::isOverflowIntrOpRes(CCVal)) {
7535 // Only lower legal XALUO ops.
7536 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
7537 return SDValue();
7538
7541 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
7542 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
7543
7544 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
7545 CCVal, Overflow);
7546 }
7547
7548 // Lower it the same way as we would lower a SELECT_CC node.
7549 ISD::CondCode CC;
7550 SDValue LHS, RHS;
7551 if (CCVal.getOpcode() == ISD::SETCC) {
7552 LHS = CCVal.getOperand(0);
7553 RHS = CCVal.getOperand(1);
7554 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
7555 } else {
7556 LHS = CCVal;
7557 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
7558 CC = ISD::SETNE;
7559 }
7560 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
7561}
7562
7563SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
7564 SelectionDAG &DAG) const {
7565 // Jump table entries are PC-relative offsets. No additional tweaking
7566 // is necessary here; just get the address of the jump table.
7568
7570 !Subtarget->isTargetMachO()) {
7571 return getAddrLarge(JT, DAG);
7572 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7573 return getAddrTiny(JT, DAG);
7574 }
7575 return getAddr(JT, DAG);
7576}
7577
7578SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
7579 SelectionDAG &DAG) const {
7580 // Jump table entries are PC-relative offsets. No additional tweaking
7581 // is necessary here; just get the address of the jump table.
7582 SDLoc DL(Op);
7583 SDValue JT = Op.getOperand(1);
7584 SDValue Entry = Op.getOperand(2);
7585 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
7586
7587 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7588 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
7589
7590 SDNode *Dest =
7591 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
7592 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
7593 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
7594 SDValue(Dest, 0));
7595}
7596
7597SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
7598 SelectionDAG &DAG) const {
7600
7602 // Use the GOT for the large code model on iOS.
7603 if (Subtarget->isTargetMachO()) {
7604 return getGOT(CP, DAG);
7605 }
7606 return getAddrLarge(CP, DAG);
7607 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7608 return getAddrTiny(CP, DAG);
7609 } else {
7610 return getAddr(CP, DAG);
7611 }
7612}
7613
7614SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
7615 SelectionDAG &DAG) const {
7618 !Subtarget->isTargetMachO()) {
7619 return getAddrLarge(BA, DAG);
7620 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7621 return getAddrTiny(BA, DAG);
7622 }
7623 return getAddr(BA, DAG);
7624}
7625
7626SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
7627 SelectionDAG &DAG) const {
7628 AArch64FunctionInfo *FuncInfo =
7630
7631 SDLoc DL(Op);
7632 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
7635 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7636 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
7637 MachinePointerInfo(SV));
7638}
7639
7640SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
7641 SelectionDAG &DAG) const {
7642 AArch64FunctionInfo *FuncInfo =
7644
7645 SDLoc DL(Op);
7646 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
7647 ? FuncInfo->getVarArgsGPRIndex()
7648 : FuncInfo->getVarArgsStackIndex(),
7649 getPointerTy(DAG.getDataLayout()));
7650 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7651 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
7652 MachinePointerInfo(SV));
7653}
7654
7655SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
7656 SelectionDAG &DAG) const {
7657 // The layout of the va_list struct is specified in the AArch64 Procedure Call
7658 // Standard, section B.3.
7661 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7663 auto PtrVT = getPointerTy(DAG.getDataLayout());
7664 SDLoc DL(Op);
7665
7666 SDValue Chain = Op.getOperand(0);
7667 SDValue VAList = Op.getOperand(1);
7668 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7670
7671 // void *__stack at offset 0
7672 unsigned Offset = 0;
7674 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
7675 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
7677
7678 // void *__gr_top at offset 8 (4 on ILP32)
7679 Offset += PtrSize;
7680 int GPRSize = FuncInfo->getVarArgsGPRSize();
7681 if (GPRSize > 0) {
7683
7685 DAG.getConstant(Offset, DL, PtrVT));
7686
7687 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
7689 DAG.getConstant(GPRSize, DL, PtrVT));
7691
7692 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
7694 Align(PtrSize)));
7695 }
7696
7697 // void *__vr_top at offset 16 (8 on ILP32)
7698 Offset += PtrSize;
7699 int FPRSize = FuncInfo->getVarArgsFPRSize();
7700 if (FPRSize > 0) {
7703 DAG.getConstant(Offset, DL, PtrVT));
7704
7705 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
7707 DAG.getConstant(FPRSize, DL, PtrVT));
7709
7710 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
7712 Align(PtrSize)));
7713 }
7714
7715 // int __gr_offs at offset 24 (12 on ILP32)
7716 Offset += PtrSize;
7718 DAG.getConstant(Offset, DL, PtrVT));
7719 MemOps.push_back(
7720 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
7722
7723 // int __vr_offs at offset 28 (16 on ILP32)
7724 Offset += 4;
7726 DAG.getConstant(Offset, DL, PtrVT));
7727 MemOps.push_back(
7728 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
7730
7732}
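
// A standalone sketch (plain C++, member names are illustrative) of the
// AAPCS64 va_list record that the five stores above populate, with the LP64
// offsets from the comments; ILP32 shrinks the three pointers to 4 bytes,
// giving the 4/8/12/16 offsets noted above.
#include <cstdint>
struct AAPCS64VaListSketch {
  void *Stack;    // offset 0:  next stacked-argument slot
  void *GRTop;    // offset 8:  end of the saved general-register area
  void *VRTop;    // offset 16: end of the saved FP/SIMD-register area
  int32_t GROffs; // offset 24: offset from GRTop to the next saved GP arg (<= 0)
  int32_t VROffs; // offset 28: offset from VRTop to the next saved FP arg (<= 0)
};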
7733
7734SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
7735 SelectionDAG &DAG) const {
7737
7738 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
7739 return LowerWin64_VASTART(Op, DAG);
7740 else if (Subtarget->isTargetDarwin())
7741 return LowerDarwin_VASTART(Op, DAG);
7742 else
7743 return LowerAAPCS_VASTART(Op, DAG);
7744}
7745
7746SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
7747 SelectionDAG &DAG) const {
7748 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
7749 // pointer.
7750 SDLoc DL(Op);
7751 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7752 unsigned VaListSize =
7753 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7754 ? PtrSize
7755 : Subtarget->isTargetILP32() ? 20 : 32;
7756 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
7757 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7758
7759 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
7761 Align(PtrSize), false, false, false,
7763}
7764
7765SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
7766 assert(Subtarget->isTargetDarwin() &&
7767 "automatic va_arg instruction only works on Darwin");
7768
7769 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7770 EVT VT = Op.getValueType();
7771 SDLoc DL(Op);
7772 SDValue Chain = Op.getOperand(0);
7773 SDValue Addr = Op.getOperand(1);
7774 MaybeAlign Align(Op.getConstantOperandVal(3));
7775 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
7776 auto PtrVT = getPointerTy(DAG.getDataLayout());
7778 SDValue VAList =
7779 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
7780 Chain = VAList.getValue(1);
7782
7783 if (VT.isScalableVector())
7784 report_fatal_error("Passing SVE types to variadic functions is "
7785 "currently not supported");
7786
7787 if (Align && *Align > MinSlotSize) {
7789 DAG.getConstant(Align->value() - 1, DL, PtrVT));
7791 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
7792 }
7793
7794 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
7795 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
7796
7797 // Scalar integer and FP values smaller than 64 bits are implicitly extended
7798 // up to 64 bits. At the very least, we have to increase the striding of the
7799 // vaargs list to match this, and for FP values we need to introduce
7800 // FP_ROUND nodes as well.
7801 if (VT.isInteger() && !VT.isVector())
7802 ArgSize = std::max(ArgSize, MinSlotSize);
7803 bool NeedFPTrunc = false;
7804 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
7805 ArgSize = 8;
7806 NeedFPTrunc = true;
7807 }
7808
7809 // Increment the pointer, VAList, to the next vaarg
7811 DAG.getConstant(ArgSize, DL, PtrVT));
7813
7814 // Store the incremented VAList to the legalized pointer
7816 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
7817
7818 // Load the actual argument out of the pointer VAList
7819 if (NeedFPTrunc) {
7820 // Load the value as an f64.
7821 SDValue WideFP =
7823 // Round the value down to an f32.
7824 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
7825 DAG.getIntPtrConstant(1, DL));
7826 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
7827 // Merge the rounded value with the chain output of the load.
7828 return DAG.getMergeValues(Ops, DL);
7829 }
7830
7831 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
7832}
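
// A standalone sketch (plain C++, function name is illustrative) of the
// round-up applied to the va_list pointer above for over-aligned arguments;
// Align is assumed to be a power of two, so masking with ~(Align - 1) matches
// the add-then-AND-with-negated-alignment done in the DAG.
#include <cstdint>
static uint64_t alignUpSketch(uint64_t Ptr, uint64_t Align) {
  return (Ptr + Align - 1) & ~(Align - 1); // bump past the slot, clear low bits
}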
7833
7834SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
7835 SelectionDAG &DAG) const {
7837 MFI.setFrameAddressIsTaken(true);
7838
7839 EVT VT = Op.getValueType();
7840 SDLoc DL(Op);
7841 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7843 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
7844 while (Depth--)
7845 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
7847
7848 if (Subtarget->isTargetILP32())
7850 DAG.getValueType(VT));
7851
7852 return FrameAddr;
7853}
7854
7855SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
7856 SelectionDAG &DAG) const {
7858
7859 EVT VT = getPointerTy(DAG.getDataLayout());
7860 SDLoc DL(Op);
7861 int FI = MFI.CreateFixedObject(4, 0, false);
7862 return DAG.getFrameIndex(FI, VT);
7863}
7864
7865#define GET_REGISTER_MATCHER
7866#include "AArch64GenAsmMatcher.inc"
7867
7868// FIXME? Maybe this could be a TableGen attribute on some registers and
7869// this table could be generated automatically from RegInfo.
7870Register AArch64TargetLowering::
7871getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
7873 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
7874 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
7875 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
7876 if (!Subtarget->isXRegisterReserved(DwarfRegNum))
7877 Reg = 0;
7878 }
7879 if (Reg)
7880 return Reg;
7881 report_fatal_error(Twine("Invalid register name \""
7882 + StringRef(RegName) + "\"."));
7883}
7884
7885SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
7886 SelectionDAG &DAG) const {
7888
7889 EVT VT = Op.getValueType();
7890 SDLoc DL(Op);
7891
7893 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
7895
7896 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
7897}
7898
7899SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
7900 SelectionDAG &DAG) const {
7902 MachineFrameInfo &MFI = MF.getFrameInfo();
7903 MFI.setReturnAddressIsTaken(true);
7904
7905 EVT VT = Op.getValueType();
7906 SDLoc DL(Op);
7907 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7909 if (Depth) {
7910 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7912 ReturnAddress = DAG.getLoad(
7913 VT, DL, DAG.getEntryNode(),
7915 } else {
7916 // Return LR, which contains the return address. Mark it as an implicit
7917 // live-in.
7918 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
7919 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7920 }
7921
7922 // The XPACLRI instruction assembles to a hint-space instruction before
7923 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A
7924 // architecture. On Armv8.3-A and onwards, XPACI is available, so use
7925 // that instead.
7926 SDNode *St;
7927 if (Subtarget->hasPAuth()) {
7928 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
7929 } else {
7930 // XPACLRI operates on LR therefore we must move the operand accordingly.
7931 SDValue Chain =
7932 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
7933 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
7934 }
7935 return SDValue(St, 0);
7936}
7937
7938 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
7939 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
7940SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
7941 SelectionDAG &DAG) const {
7942 SDValue Lo, Hi;
7943 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
7944 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
7945}
7946
7948 const GlobalAddressSDNode *GA) const {
7949 // Offsets are folded in the DAG combine rather than here so that we can
7950 // intelligently choose an offset based on the uses.
7951 return false;
7952}
7953
7955 bool OptForSize) const {
7956 bool IsLegal = false;
7957 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
7958 // and for the 16-bit case when the target has full fp16 support.
7959 // FIXME: We should be able to handle f128 as well with a clever lowering.
7960 const APInt ImmInt = Imm.bitcastToAPInt();
7961 if (VT == MVT::f64)
7963 else if (VT == MVT::f32)
7965 else if (VT == MVT::f16 && Subtarget->hasFullFP16())
7967 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
7968 // generate that fmov.
7969
7970 // If we cannot materialize the immediate for an fmov, check if the
7971 // value can be encoded as the immediate operand of a logical instruction.
7972 // The immediate value will be created with either MOVZ, MOVN, or ORR.
7973 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
7974 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
7975 // however the mov+fmov sequence is always better because of the reduced
7976 // cache pressure. The timings are still the same if you consider
7977 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
7978 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
7980 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
7981 Insn);
7982 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
7983 IsLegal = Insn.size() <= Limit;
7984 }
7985
7986 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
7987 << " imm value: "; Imm.dump(););
7988 return IsLegal;
7989}
7990
7991//===----------------------------------------------------------------------===//
7992// AArch64 Optimization Hooks
7993//===----------------------------------------------------------------------===//
7994
7995static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
7996 SDValue Operand, SelectionDAG &DAG,
7997 int &ExtraSteps) {
7998 EVT VT = Operand.getValueType();
7999 if (ST->hasNEON() &&
8000 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
8001 VT == MVT::f32 || VT == MVT::v1f32 ||
8002 VT == MVT::v2f32 || VT == MVT::v4f32)) {
8004 // For the reciprocal estimates, convergence is quadratic, so the number
8005 // of digits is doubled after each iteration. In ARMv8, the accuracy of
8006 // the initial estimate is 2^-8. Thus the number of extra steps to refine
8007 // the result for float (23 mantissa bits) is 2 and for double (52
8008 // mantissa bits) is 3.
8009 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
8010
8011 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
8012 }
8013
8014 return SDValue();
8015}
8016
8017SDValue
8018AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
8019 const DenormalMode &Mode) const {
8020 SDLoc DL(Op);
8021 EVT VT = Op.getValueType();
8022 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
8023 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
8024 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
8025}
8026
8027SDValue
8028AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
8029 SelectionDAG &DAG) const {
8030 return Op;
8031}
8032
8033SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
8034 SelectionDAG &DAG, int Enabled,
8035 int &ExtraSteps,
8036 bool &UseOneConst,
8037 bool Reciprocal) const {
8040 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
8041 DAG, ExtraSteps)) {
8042 SDLoc DL(Operand);
8043 EVT VT = Operand.getValueType();
8044
8045 SDNodeFlags Flags;
8046 Flags.setAllowReassociation(true);
8047
8048 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
8049 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
8050 for (int i = ExtraSteps; i > 0; --i) {
8051 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
8052 Flags);
8053 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
8054 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
8055 }
8056 if (!Reciprocal)
8057 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
8058
8059 ExtraSteps = 0;
8060 return Estimate;
8061 }
8062
8063 return SDValue();
8064}
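
// A standalone sketch (plain C++, function name is illustrative) of the
// scalar form of the refinement above: FRSQRTE supplies the initial estimate
// E, and each FRSQRTS-based step computes E = E * 0.5 * (3 - X * E * E).
static float refineRSqrtSketch(float X, float E, int Steps) {
  for (int I = 0; I < Steps; ++I)
    E = E * (0.5f * (3.0f - X * E * E)); // one Newton-Raphson iteration
  return E;                              // converges toward 1 / sqrt(X)
}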
8065
8066SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
8067 SelectionDAG &DAG, int Enabled,
8068 int &ExtraSteps) const {
8070 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
8071 DAG, ExtraSteps)) {
8072 SDLoc DL(Operand);
8073 EVT VT = Operand.getValueType();
8074
8075 SDNodeFlags Flags;
8076 Flags.setAllowReassociation(true);
8077
8078 // Newton reciprocal iteration: E * (2 - X * E)
8079 // AArch64 reciprocal iteration instruction: (2 - M * N)
8080 for (int i = ExtraSteps; i > 0; --i) {
8081 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
8082 Estimate, Flags);
8083 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
8084 }
8085
8086 ExtraSteps = 0;
8087 return Estimate;
8088 }
8089
8090 return SDValue();
8091}
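
// A standalone sketch (plain C++, function name is illustrative) of the
// scalar form of the refinement above: FRECPE supplies the initial estimate
// E, and each FRECPS-based step computes E = E * (2 - X * E).
static float refineRecipSketch(float X, float E, int Steps) {
  for (int I = 0; I < Steps; ++I)
    E = E * (2.0f - X * E); // one Newton-Raphson iteration
  return E;                 // converges toward 1 / X
}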
8092
8093//===----------------------------------------------------------------------===//
8094// AArch64 Inline Assembly Support
8095//===----------------------------------------------------------------------===//
8096
8097// Table of Constraints
8098 // TODO: This is the current set of constraints supported by ARM for the
8099 // compiler; not all of them may make sense.
8100//
8101// r - A general register
8102// w - An FP/SIMD register of some size in the range v0-v31
8103// x - An FP/SIMD register of some size in the range v0-v15
8104// I - Constant that can be used with an ADD instruction
8105// J - Constant that can be used with a SUB instruction
8106// K - Constant that can be used with a 32-bit logical instruction
8107// L - Constant that can be used with a 64-bit logical instruction
8108// M - Constant that can be used as a 32-bit MOV immediate
8109// N - Constant that can be used as a 64-bit MOV immediate
8110// Q - A memory reference with base register and no offset
8111// S - A symbolic address
8112// Y - Floating point constant zero
8113// Z - Integer constant zero
8114//
8115// Note that general register operands will be output using their 64-bit x
8116// register name, whatever the size of the variable, unless the asm operand
8117// is prefixed by the %w modifier. Floating-point and SIMD register operands
8118// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
8119// %q modifier.
8120const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
8121 // At this point, we have to lower this constraint to something else, so we
8122 // lower it to an "r" or "w". However, by doing this we will force the result
8123 // to be in a register, while the X constraint is much more permissive.
8124 //
8125 // Although we are correct (we are free to emit anything, without
8126 // constraints), we might break use cases that would expect us to be more
8127 // efficient and emit something else.
8128 if (!Subtarget->hasFPARMv8())
8129 return "r";
8130
8131 if (ConstraintVT.isFloatingPoint())
8132 return "w";
8133
8134 if (ConstraintVT.isVector() &&
8135 (ConstraintVT.getSizeInBits() == 64 ||
8136 ConstraintVT.getSizeInBits() == 128))
8137 return "w";
8138
8139 return "r";
8140}
8141
8142enum PredicateConstraint {
8143 Upl,
8144 Upa,
8145 Invalid
8146};
8147
8148static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
8149 PredicateConstraint P = PredicateConstraint::Invalid;
8150 if (Constraint == "Upa")
8151 P = PredicateConstraint::Upa;
8152 if (Constraint == "Upl")
8153 P = PredicateConstraint::Upl;
8154 return P;
8155}
8156
8157/// getConstraintType - Given a constraint letter, return the type of
8158/// constraint it is for this target.
8160AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
8161 if (Constraint.size() == 1) {
8162 switch (Constraint[0]) {
8163 default:
8164 break;
8165 case 'x':
8166 case 'w':
8167 case 'y':
8168 return C_RegisterClass;
8169 // An address with a single base register. Due to the way we
8170 // currently handle addresses it is the same as 'r'.
8171 case 'Q':
8172 return C_Memory;
8173 case 'I':
8174 case 'J':
8175 case 'K':
8176 case 'L':
8177 case 'M':
8178 case 'N':
8179 case 'Y':
8180 case 'Z':
8181 return C_Immediate;
8182 case 'z':
8183 case 'S': // A symbolic address
8184 return C_Other;
8185 }
8186 } else if (parsePredicateConstraint(Constraint) !=
8187 PredicateConstraint::Invalid)
8188 return C_RegisterClass;
8189 return TargetLowering::getConstraintType(Constraint);
8190}
8191
8192/// Examine constraint type and operand type and determine a weight value.
8193/// This object must already have been set up with the operand type
8194/// and the current alternative constraint selected.
8196AArch64TargetLowering::getSingleConstraintMatchWeight(
8197 AsmOperandInfo &info, const char *constraint) const {
8199 Value *CallOperandVal = info.CallOperandVal;
8200 // If we don't have a value, we can't do a match,
8201 // but allow it at the lowest weight.
8202 if (!CallOperandVal)
8203 return CW_Default;
8204 Type *type = CallOperandVal->getType();
8205 // Look at the constraint type.
8206 switch (*constraint) {
8207 default:
8209 break;
8210 case 'x':
8211 case 'w':
8212 case 'y':
8213 if (type->isFloatingPointTy() || type->isVectorTy())
8214 weight = CW_Register;
8215 break;
8216 case 'z':
8217 weight = CW_Constant;
8218 break;
8219 case 'U':
8221 weight = CW_Register;
8222 break;
8223 }
8224 return weight;
8225}
8226
8227std::pair<unsigned, const TargetRegisterClass *>
8228AArch64TargetLowering::getRegForInlineAsmConstraint(
8229 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
8230 if (Constraint.size() == 1) {
8231 switch (Constraint[0]) {
8232 case 'r':
8233 if (VT.isScalableVector())
8234 return std::make_pair(0U, nullptr);
8235 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
8236 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
8237 if (VT.getFixedSizeInBits() == 64)
8238 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
8239 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
8240 case 'w': {
8241 if (!Subtarget->hasFPARMv8())
8242 break;
8243 if (VT.isScalableVector()) {
8244 if (VT.getVectorElementType() != MVT::i1)
8245 return std::make_pair(0U, &AArch64::ZPRRegClass);
8246 return std::make_pair(0U, nullptr);
8247 }
8248 uint64_t VTSize = VT.getFixedSizeInBits();
8249 if (VTSize == 16)
8250 return std::make_pair(0U, &AArch64::FPR16RegClass);
8251 if (VTSize == 32)
8252 return std::make_pair(0U, &AArch64::FPR32RegClass);
8253 if (VTSize == 64)
8254 return std::make_pair(0U, &AArch64::FPR64RegClass);
8255 if (VTSize == 128)
8256 return std::make_pair(0U, &AArch64::FPR128RegClass);
8257 break;
8258 }
8259 // The instructions that this constraint is designed for can
8260 // only take 128-bit registers so just use that regclass.
8261 case 'x':
8262 if (!Subtarget->hasFPARMv8())
8263 break;
8264 if (VT.isScalableVector())
8265 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
8266 if (VT.getSizeInBits() == 128)
8267 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
8268 break;
8269 case 'y':
8270 if (!Subtarget->hasFPARMv8())
8271 break;
8272 if (VT.isScalableVector())
8273 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
8274 break;
8275 }
8276 } else {
8277 PredicateConstraint PC = parsePredicateConstraint(Constraint);
8278 if (PC != PredicateConstraint::Invalid) {
8279 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
8280 return std::make_pair(0U, nullptr);
8281 bool restricted = (PC == PredicateConstraint::Upl);
8282 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
8283 : std::make_pair(0U, &AArch64::PPRRegClass);
8284 }
8285 }
8286 if (StringRef("{cc}").equals_insensitive(Constraint))
8287 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
8288
8289 // Use the default implementation in TargetLowering to convert the register
8290 // constraint into a member of a register class.
8291 std::pair<unsigned, const TargetRegisterClass *> Res;
8292 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
8293
8294 // Not found as a standard register?
8295 if (!Res.second) {
8296 unsigned Size = Constraint.size();
8297 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
8298 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
8299 int RegNo;
8300 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
8301 if (!Failed && RegNo >= 0 && RegNo <= 31) {
8302 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
8303 // By default we'll emit v0-v31 for this unless there's a modifier where
8304 // we'll emit the correct register as well.
8305 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
8306 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
8307 Res.second = &AArch64::FPR64RegClass;
8308 } else {
8309 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
8310 Res.second = &AArch64::FPR128RegClass;
8311 }
8312 }
8313 }
8314 }
8315
8316 if (Res.second && !Subtarget->hasFPARMv8() &&
8317 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
8318 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
8319 return std::make_pair(0U, nullptr);
8320
8321 return Res;
8322}
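// Illustrative example of the explicit vector-register constraint handled in
// the fallback path above (values chosen for exposition): an IR-level
// constraint string of "{v7}" with a 64-bit operand type such as <2 x float>
// resolves to d7 in FPR64RegClass, while the same constraint with a 128-bit
// type such as <4 x float> resolves to q7 in FPR128RegClass.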
8323
8325 llvm::Type *Ty,
8326 bool AllowUnknown) const {
8327 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
8328 return EVT(MVT::i64x8);
8329
8330 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
8331}
8332
8333/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
8334/// vector. If it is invalid, don't add anything to Ops.
8335void AArch64TargetLowering::LowerAsmOperandForConstraint(
8336 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
8337 SelectionDAG &DAG) const {
8338 SDValue Result;
8339
8340 // Currently only support length 1 constraints.
8341 if (Constraint.length() != 1)
8342 return;
8343
8344 char ConstraintLetter = Constraint[0];
8345 switch (ConstraintLetter) {
8346 default:
8347 break;
8348
8349 // This set of constraints deal with valid constants for various instructions.
8350 // Validate and return a target constant for them if we can.
8351 case 'z': {
8352 // 'z' maps to xzr or wzr so it needs an input of 0.
8353 if (!isNullConstant(Op))
8354 return;
8355
8356 if (Op.getValueType() == MVT::i64)
8357 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
8358 else
8359 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
8360 break;
8361 }
8362 case 'S': {
8363 // An absolute symbolic address or label reference.
8364 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
8365 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
8366 GA->getValueType(0));
8367 } else if (const BlockAddressSDNode *BA =
8368 dyn_cast<BlockAddressSDNode>(Op)) {
8369 Result =
8370 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
8371 } else
8372 return;
8373 break;
8374 }
8375
8376 case 'I':
8377 case 'J':
8378 case 'K':
8379 case 'L':
8380 case 'M':
8381 case 'N':
8383 if (!C)
8384 return;
8385
8386 // Grab the value and do some validation.
8387 uint64_t CVal = C->getZExtValue();
8388 switch (ConstraintLetter) {
8389 // The I constraint applies only to simple ADD or SUB immediate operands:
8390 // i.e. 0 to 4095 with optional shift by 12
8391 // The J constraint applies only to ADD or SUB immediates that would be
8392 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
8393 // instruction [or vice versa], in other words -1 to -4095 with optional
8394 // left shift by 12.
8395 case 'I':
8396 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
8397 break;
8398 return;
8399 case 'J': {
8400 uint64_t NVal = -C->getSExtValue();
8401 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
8402 CVal = C->getSExtValue();
8403 break;
8404 }
8405 return;
8406 }
8407 // The K and L constraints apply *only* to logical immediates, including
8408 // what used to be the MOVI alias for ORR (though the MOVI alias has now
8409 // been removed and MOV should be used). So these constraints have to
8410 // distinguish between bit patterns that are valid 32-bit or 64-bit
8411 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
8412 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
8413 // versa.
8414 case 'K':
8415 if (AArch64_AM::isLogicalImmediate(CVal, 32))
8416 break;
8417 return;
8418 case 'L':
8419 if (AArch64_AM::isLogicalImmediate(CVal, 64))
8420 break;
8421 return;
8422 // The M and N constraints are a superset of K and L respectively, for use
8423 // with the MOV (immediate) alias. As well as the logical immediates they
8424 // also match 32 or 64-bit immediates that can be loaded either using a
8425 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
8426 // (M) or 64-bit 0x1234000000000000 (N) etc.
8427 // As a note some of this code is liberally stolen from the asm parser.
8428 case 'M': {
8429 if (!isUInt<32>(CVal))
8430 return;
8431 if (AArch64_AM::isLogicalImmediate(CVal, 32))
8432 break;
8433 if ((CVal & 0xFFFF) == CVal)
8434 break;
8435 if ((CVal & 0xFFFF0000ULL) == CVal)
8436 break;
8437 uint64_t NCVal = ~(uint32_t)CVal;
8438 if ((NCVal & 0xFFFFULL) == NCVal)
8439 break;
8440 if ((NCVal & 0xFFFF0000ULL) == NCVal)
8441 break;
8442 return;
8443 }
8444 case 'N': {
8445 if (AArch64_AM::isLogicalImmediate(CVal, 64))
8446 break;
8447 if ((CVal & 0xFFFFULL) == CVal)
8448 break;
8449 if ((CVal & 0xFFFF0000ULL) == CVal)
8450 break;
8451 if ((CVal & 0xFFFF00000000ULL) == CVal)
8452 break;
8453 if ((CVal & 0xFFFF000000000000ULL) == CVal)
8454 break;
8455 uint64_t NCVal = ~CVal;
8456 if ((NCVal & 0xFFFFULL) == NCVal)
8457 break;
8458 if ((NCVal & 0xFFFF0000ULL) == NCVal)
8459 break;
8460 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
8461 break;
8462 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
8463 break;
8464 return;
8465 }
8466 default:
8467 return;
8468 }
8469
8470 // All assembler immediates are 64-bit integers.
8471 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
8472 break;
8473 }
8474
8475 if (Result.getNode()) {
8476 Ops.push_back(Result);
8477 return;
8478 }
8479
8480 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
8481}
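// A few concrete values for the immediate constraints validated above
// (illustrative, not exhaustive):
//   'I': 4095 and 0xFFF000 (4095 << 12) are accepted; 4097 is rejected
//        because it fits neither form.
//   'K': 0xAAAAAAAA is a valid 32-bit logical immediate.
//   'M': 0x12340000 is accepted because it can be materialised with a single
//        32-bit MOVZ (value 0x1234, LSL #16).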
8482
8483//===----------------------------------------------------------------------===//
8484// AArch64 Advanced SIMD Support
8485//===----------------------------------------------------------------------===//
8486
8487/// WidenVector - Given a value in the V64 register class, produce the
8488/// equivalent value in the V128 register class.
8489static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
8490 EVT VT = V64Reg.getValueType();
8491 unsigned NarrowSize = VT.getVectorNumElements();
8492 MVT EltTy = VT.getVectorElementType().getSimpleVT();
8493 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
8494 SDLoc DL(V64Reg);
8495
8496 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
8497 V64Reg, DAG.getConstant(0, DL, MVT::i64));
8498}
8499
8500/// getExtFactor - Determine the adjustment factor for the position when
8501/// generating an "extract from vector registers" instruction.
8502static unsigned getExtFactor(SDValue &V) {
8503 EVT EltType = V.getValueType().getVectorElementType();
8504 return EltType.getSizeInBits() / 8;
8505}
8506
8507/// NarrowVector - Given a value in the V128 register class, produce the
8508/// equivalent value in the V64 register class.
8509static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
8510 EVT VT = V128Reg.getValueType();
8511 unsigned WideSize = VT.getVectorNumElements();
8512 MVT EltTy = VT.getVectorElementType().getSimpleVT();
8513 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
8514 SDLoc DL(V128Reg);
8515
8516 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
8517}
8518
8519// Gather data to see if the operation can be modelled as a
8520// shuffle in combination with VEXTs.
8521SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
8522 SelectionDAG &DAG) const {
8523 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8524 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
8525 SDLoc dl(Op);
8526 EVT VT = Op.getValueType();
8527 assert(!VT.isScalableVector() &&
8528 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
8529 unsigned NumElts = VT.getVectorNumElements();
8530
8531 struct ShuffleSourceInfo {
8532 SDValue Vec;
8533 unsigned MinElt;
8534 unsigned MaxElt;
8535
8536 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8537 // be compatible with the shuffle we intend to construct. As a result
8538 // ShuffleVec will be some sliding window into the original Vec.
8540
8541 // Code should guarantee that element i in Vec starts at element "WindowBase
8542 // + i * WindowScale in ShuffleVec".
8543 int WindowBase;
8544 int WindowScale;
8545
8547 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
8548 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
8549
8550 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8551 };
8552
8553 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8554 // node.
8556 for (unsigned i = 0; i < NumElts; ++i) {
8557 SDValue V = Op.getOperand(i);
8558 if (V.isUndef())
8559 continue;
8560 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8561 !isa<ConstantSDNode>(V.getOperand(1))) {
8562 LLVM_DEBUG(
8563 dbgs() << "Reshuffle failed: "
8564 "a shuffle can only come from building a vector from "
8565 "various elements of other vectors, provided their "
8566 "indices are constant\n");
8567 return SDValue();
8568 }
8569
8570 // Add this element source to the list if it's not already there.
8571 SDValue SourceVec = V.getOperand(0);
8572 auto Source = find(Sources, SourceVec);
8573 if (Source == Sources.end())
8574 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8575
8576 // Update the minimum and maximum lane number seen.
8577 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
8578 Source->MinElt = std::min(Source->MinElt, EltNo);
8579 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8580 }
8581
8582 if (Sources.size() > 2) {
8583 LLVM_DEBUG(
8584 dbgs() << "Reshuffle failed: currently only do something sane when at "
8585 "most two source vectors are involved\n");
8586 return SDValue();
8587 }
8588
8589 // Find out the smallest element size among result and two sources, and use
8590 // it as element size to build the shuffle_vector.
8592 for (auto &Source : Sources) {
8593 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8594 if (SrcEltTy.bitsLT(SmallestEltTy)) {
8596 }
8597 }
8598 unsigned ResMultiplier =
8599 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8600 uint64_t VTSize = VT.getFixedSizeInBits();
8601 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
8603
8604 // If the source vector is too wide or too narrow, we may nevertheless be able
8605 // to construct a compatible shuffle either by concatenating it with UNDEF or
8606 // extracting a suitable range of elements.
8607 for (auto &Src : Sources) {
8608 EVT SrcVT = Src.ShuffleVec.getValueType();
8609
8610 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8611 if (SrcVTSize == VTSize)
8612 continue;
8613
8614 // This stage of the search produces a source with the same element type as
8615 // the original, but with a total width matching the BUILD_VECTOR output.
8616 EVT EltVT = SrcVT.getVectorElementType();
8617 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8619
8620 if (SrcVTSize < VTSize) {
8621 assert(2 * SrcVTSize == VTSize);
8622 // We can pad out the smaller vector for free, so if it's part of a
8623 // shuffle...
8624 Src.ShuffleVec =
8625 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8626 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8627 continue;
8628 }
8629
8630 if (SrcVTSize != 2 * VTSize) {
8631 LLVM_DEBUG(
8632 dbgs() << "Reshuffle failed: result vector too small to extract\n");
8633 return SDValue();
8634 }
8635
8636 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8637 LLVM_DEBUG(
8638 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
8639 return SDValue();
8640 }
8641
8642 if (Src.MinElt >= NumSrcElts) {
8643 // The extraction can just take the second half
8644 Src.ShuffleVec =
8645 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8646 DAG.getConstant(NumSrcElts, dl, MVT::i64));
8647 Src.WindowBase = -NumSrcElts;
8648 } else if (Src.MaxElt < NumSrcElts) {
8649 // The extraction can just take the first half
8650 Src.ShuffleVec =
8651 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8652 DAG.getConstant(0, dl, MVT::i64));
8653 } else {
8654 // An actual VEXT is needed
8656 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8657 DAG.getConstant(0, dl, MVT::i64));
8659 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8660 DAG.getConstant(NumSrcElts, dl, MVT::i64));
8661 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
8662
8663 if (!SrcVT.is64BitVector()) {
8664 LLVM_DEBUG(
8665 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
8666 "for SVE vectors.");
8667 return SDValue();
8668 }
8669
8670 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
8671 VEXTSrc2,
8672 DAG.getConstant(Imm, dl, MVT::i32));
8673 Src.WindowBase = -Src.MinElt;
8674 }
8675 }
8676
8677 // Another possible incompatibility occurs from the vector element types. We
8678 // can fix this by bitcasting the source vectors to the same type we intend
8679 // for the shuffle.
8680 for (auto &Src : Sources) {
8681 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8682 if (SrcEltTy == SmallestEltTy)
8683 continue;
8684 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8685 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
8686 Src.WindowScale =
8687 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8688 Src.WindowBase *= Src.WindowScale;
8689 }
8690
8691 // Final sanity check before we try to actually produce a shuffle.
8692 LLVM_DEBUG(for (auto Src
8693 : Sources)
8694 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
8695
8696 // The stars all align, our next step is to produce the mask for the shuffle.
8697 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8698 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8699 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8700 SDValue Entry = Op.getOperand(i);
8701 if (Entry.isUndef())
8702 continue;
8703
8704 auto Src = find(Sources, Entry.getOperand(0));
8705 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8706
8707 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8708 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8709 // segment.
8710 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8711 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8712 VT.getScalarSizeInBits());
8714
8715 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8716 // starting at the appropriate offset.
8717 int *LaneMask = &Mask[i * ResMultiplier];
8718
8719 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8720 ExtractBase += NumElts * (Src - Sources.begin());
8721 for (int j = 0; j < LanesDefined; ++j)
8722 LaneMask[j] = ExtractBase + j;
8723 }
8724
8725 // Final check before we try to produce nonsense...
8726 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
8727 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
8728 return SDValue();
8729 }
8730
8732 for (unsigned i = 0; i < Sources.size(); ++i)
8733 ShuffleOps[i] = Sources[i].ShuffleVec;
8734
8735 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8736 ShuffleOps[1], Mask);
8737 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
8738
8739 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
8740 dbgs() << "Reshuffle, creating node: "; V.dump(););
8741
8742 return V;
8743}
8744
8745// check if an EXT instruction can handle the shuffle mask when the
8746// vector sources of the shuffle are the same.
8747static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
8748 unsigned NumElts = VT.getVectorNumElements();
8749
8750 // Assume that the first shuffle index is not UNDEF. Fail if it is.
8751 if (M[0] < 0)
8752 return false;
8753
8754 Imm = M[0];
8755
8756 // If this is a VEXT shuffle, the immediate value is the index of the first
8757 // element. The other shuffle indices must be the successive elements after
8758 // the first one.
8759 unsigned ExpectedElt = Imm;
8760 for (unsigned i = 1; i < NumElts; ++i) {
8761 // Increment the expected index. If it wraps around, just follow it
8762 // back to index zero and keep going.
8763 ++ExpectedElt;
8764 if (ExpectedElt == NumElts)
8765 ExpectedElt = 0;
8766
8767 if (M[i] < 0)
8768 continue; // ignore UNDEF indices
8769 if (ExpectedElt != static_cast<unsigned>(M[i]))
8770 return false;
8771 }
8772
8773 return true;
8774}
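// Example (illustrative): for v8i8 with both shuffle sources equal, the mask
// <3, 4, 5, 6, 7, 0, 1, 2> satisfies this check with Imm = 3, i.e. a single
// EXT instruction extracting from byte offset 3 of the doubled vector.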
8775
8776/// Check if a vector shuffle corresponds to a DUP instructions with a larger
8777/// element width than the vector lane type. If that is the case the function
8778/// returns true and writes the value of the DUP instruction lane operand into
8779/// DupLaneOp
8780static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
8781 unsigned &DupLaneOp) {
8782 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8783 "Only possible block sizes for wide DUP are: 16, 32, 64");
8784
8785 if (BlockSize <= VT.getScalarSizeInBits())
8786 return false;
8787 if (BlockSize % VT.getScalarSizeInBits() != 0)
8788 return false;
8789 if (VT.getSizeInBits() % BlockSize != 0)
8790 return false;
8791
8792 size_t SingleVecNumElements = VT.getVectorNumElements();
8793 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
8794 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
8795
8796 // We are looking for masks like
8797 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
8798 // might be replaced by 'undefined'. BlockIndices will eventually contain
8799 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
8800 // for the above examples)
8801 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
8802 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
8803 for (size_t I = 0; I < NumEltsPerBlock; I++) {
8804 int Elt = M[BlockIndex * NumEltsPerBlock + I];
8805 if (Elt < 0)
8806 continue;
8807 // For now we don't support shuffles that use the second operand
8808 if ((unsigned)Elt >= SingleVecNumElements)
8809 return false;
8810 if (BlockElts[I] < 0)
8811 BlockElts[I] = Elt;
8812 else if (BlockElts[I] != Elt)
8813 return false;
8814 }
8815
8816 // We found a candidate block (possibly with some undefs). It must be a
8817 // sequence of consecutive integers starting with a value divisible by
8818 // NumEltsPerBlock with some values possibly replaced by undef-s.
8819
8820 // Find first non-undef element
8821 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
8823 "Shuffle with all-undefs must have been caught by previous cases, "
8824 "e.g. isSplat()");
8825 if (FirstRealEltIter == BlockElts.end()) {
8826 DupLaneOp = 0;
8827 return true;
8828 }
8829
8830 // Index of FirstRealElt in BlockElts
8831 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
8832
8833 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
8834 return false;
8835 // BlockElts[0] must have the following value if it isn't undef:
8836 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
8837
8838 // Check the first element
8839 if (Elt0 % NumEltsPerBlock != 0)
8840 return false;
8841 // Check that the sequence indeed consists of consecutive integers (modulo
8842 // undefs)
8843 for (size_t I = 0; I < NumEltsPerBlock; I++)
8844 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
8845 return false;
8846
8847 DupLaneOp = Elt0 / NumEltsPerBlock;
8848 return true;
8849}
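// Example (illustrative): for a v4i32 shuffle with mask <2, 3, 2, 3> and
// BlockSize == 64, the duplicated block is elements {2, 3}, so DupLaneOp
// becomes 1; the caller can then bitcast to v2i64 and emit DUPLANE64 lane 1.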
8850
8851// check if an EXT instruction can handle the shuffle mask when the
8852// vector sources of the shuffle are different.
8853static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
8854 unsigned &Imm) {
8855 // Look for the first non-undef element.
8856 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
8857
8858 // Benefit from APInt to handle overflow when calculating the expected element.
8859 unsigned NumElts = VT.getVectorNumElements();
8860 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
8861 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
8862 // The following shuffle indices must be the successive elements after the
8863 // first real element.
8864 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
8865 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
8866 if (FirstWrongElt != M.end())
8867 return false;
8868
8869 // The index of an EXT is the first element if it is not UNDEF.
8870 // Watch out for the beginning UNDEFs. The EXT index should be the expected
8871 // value of the first element. E.g.
8872 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
8873 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
8874 // ExpectedElt is the last mask index plus 1.
8875 Imm = ExpectedElt.getZExtValue();
8876
8877 // There are two different cases that require reversing the input vectors.
8878 // For example, for vector <4 x i32> we have the following cases,
8879 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
8880 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
8881 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
8882 // to reverse two input vectors.
8883 if (Imm < NumElts)
8884 ReverseEXT = true;
8885 else
8886 Imm -= NumElts;
8887
8888 return true;
8889}
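// Example (illustrative): a v4i32 shuffle of V1 and V2 with mask <1, 2, 3, 4>
// passes this check with ReverseEXT == false and Imm == 1; after scaling by
// the element size in LowerVECTOR_SHUFFLE this becomes
// "ext vD.16b, v1.16b, v2.16b, #4".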
8890
8891/// isREVMask - Check if a vector shuffle corresponds to a REV
8892/// instruction with the specified blocksize. (The order of the elements
8893/// within each block of the vector is reversed.)
8894static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
8895 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8896 "Only possible block sizes for REV are: 16, 32, 64");
8897
8898 unsigned EltSz = VT.getScalarSizeInBits();
8899 if (EltSz == 64)
8900 return false;
8901
8902 unsigned NumElts = VT.getVectorNumElements();
8903 unsigned BlockElts = M[0] + 1;
8904 // If the first shuffle index is UNDEF, be optimistic.
8905 if (M[0] < 0)
8906 BlockElts = BlockSize / EltSz;
8907
8908 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
8909 return false;
8910
8911 for (unsigned i = 0; i < NumElts; ++i) {
8912 if (M[i] < 0)
8913 continue; // ignore UNDEF indices
8914 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
8915 return false;
8916 }
8917
8918 return true;
8919}
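// Example (illustrative): for v8i8 and BlockSize == 32, the mask
// <3, 2, 1, 0, 7, 6, 5, 4> passes this check, i.e. each 32-bit block has its
// four byte lanes reversed, which is exactly what REV32 does.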
8920
8921static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8922 unsigned NumElts = VT.getVectorNumElements();
8923 if (NumElts % 2 != 0)
8924 return false;
8925 WhichResult = (M[0] == 0 ? 0 : 1);
8926 unsigned Idx = WhichResult * NumElts / 2;
8927 for (unsigned i = 0; i != NumElts; i += 2) {
8928 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8929 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
8930 return false;
8931 Idx += 1;
8932 }
8933
8934 return true;
8935}
8936
8937static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8938 unsigned NumElts = VT.getVectorNumElements();
8939 WhichResult = (M[0] == 0 ? 0 : 1);
8940 for (unsigned i = 0; i != NumElts; ++i) {
8941 if (M[i] < 0)
8942 continue; // ignore UNDEF indices
8943 if ((unsigned)M[i] != 2 * i + WhichResult)
8944 return false;
8945 }
8946
8947 return true;
8948}
8949
8950static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8951 unsigned NumElts = VT.getVectorNumElements();
8952 if (NumElts % 2 != 0)
8953 return false;
8954 WhichResult = (M[0] == 0 ? 0 : 1);
8955 for (unsigned i = 0; i < NumElts; i += 2) {
8956 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
8957 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
8958 return false;
8959 }
8960 return true;
8961}
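// Examples of the three interleave masks above for v4i32 (illustrative,
// WhichResult == 0 in each case):
//   ZIP1: <0, 4, 1, 5>   UZP1: <0, 2, 4, 6>   TRN1: <0, 4, 2, 6>
// The WhichResult == 1 variants select the second half / odd lanes instead,
// e.g. ZIP2: <2, 6, 3, 7>.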
8962
8963/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
8964/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8965/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
8966static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8967 unsigned NumElts = VT.getVectorNumElements();
8968 if (NumElts % 2 != 0)
8969 return false;
8970 WhichResult = (M[0] == 0 ? 0 : 1);
8971 unsigned Idx = WhichResult * NumElts / 2;
8972 for (unsigned i = 0; i != NumElts; i += 2) {
8973 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8974 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
8975 return false;
8976 Idx += 1;
8977 }
8978
8979 return true;
8980}
8981
8982/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
8983/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8984/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
8985static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8986 unsigned Half = VT.getVectorNumElements() / 2;
8987 WhichResult = (M[0] == 0 ? 0 : 1);
8988 for (unsigned j = 0; j != 2; ++j) {
8989 unsigned Idx = WhichResult;
8990 for (unsigned i = 0; i != Half; ++i) {
8991 int MIdx = M[i + j * Half];
8992 if (MIdx >= 0 && (unsigned)MIdx != Idx)
8993 return false;
8994 Idx += 2;
8995 }
8996 }
8997
8998 return true;
8999}
9000
9001/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
9002/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
9003/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
9004static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
9005 unsigned NumElts = VT.getVectorNumElements();
9006 if (NumElts % 2 != 0)
9007 return false;
9008 WhichResult = (M[0] == 0 ? 0 : 1);
9009 for (unsigned i = 0; i < NumElts; i += 2) {
9010 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
9011 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
9012 return false;
9013 }
9014 return true;
9015}
9016
9017static bool isINSMask(ArrayRef<int> M, int NumInputElements,
9018 bool &DstIsLeft, int &Anomaly) {
9019 if (M.size() != static_cast<size_t>(NumInputElements))
9020 return false;
9021
9022 int NumLHSMatch = 0, NumRHSMatch = 0;
9023 int LastLHSMismatch = -1, LastRHSMismatch = -1;
9024
9025 for (int i = 0; i < NumInputElements; ++i) {
9026 if (M[i] == -1) {
9027 ++NumLHSMatch;
9028 ++NumRHSMatch;
9029 continue;
9030 }
9031
9032 if (M[i] == i)
9033 ++NumLHSMatch;
9034 else
9036
9037 if (M[i] == i + NumInputElements)
9038 ++NumRHSMatch;
9039 else
9041 }
9042
9043 if (NumLHSMatch == NumInputElements - 1) {
9044 DstIsLeft = true;
9046 return true;
9047 } else if (NumRHSMatch == NumInputElements - 1) {
9048 DstIsLeft = false;
9050 return true;
9051 }
9052
9053 return false;
9054}
9055
9056static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
9057 if (VT.getSizeInBits() != 128)
9058 return false;
9059
9060 unsigned NumElts = VT.getVectorNumElements();
9061
9062 for (int I = 0, E = NumElts / 2; I != E; I++) {
9063 if (Mask[I] != I)
9064 return false;
9065 }
9066
9067 int Offset = NumElts / 2;
9068 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
9069 if (Mask[I] != I + SplitLHS * Offset)
9070 return false;
9071 }
9072
9073 return true;
9074}
9075
9077 SDLoc DL(Op);
9078 EVT VT = Op.getValueType();
9079 SDValue V0 = Op.getOperand(0);
9080 SDValue V1 = Op.getOperand(1);
9081 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
9082
9083 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
9084 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
9085 return SDValue();
9086
9087 bool SplitV0 = V0.getValueSizeInBits() == 128;
9088
9089 if (!isConcatMask(Mask, VT, SplitV0))
9090 return SDValue();
9091
9093 if (SplitV0) {
9095 DAG.getConstant(0, DL, MVT::i64));
9096 }
9097 if (V1.getValueSizeInBits() == 128) {
9099 DAG.getConstant(0, DL, MVT::i64));
9100 }
9101 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
9102}
9103
9104/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
9105/// the specified operations to build the shuffle.
9107 SDValue RHS, SelectionDAG &DAG,
9108 const SDLoc &dl) {
9109 unsigned OpNum = (PFEntry >> 26) & 0x0F;
9110 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
9111 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
9112
9113 enum {
9114 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
9115 OP_VREV,
9116 OP_VDUP0,
9117 OP_VDUP1,
9118 OP_VDUP2,
9119 OP_VDUP3,
9120 OP_VEXT1,
9121 OP_VEXT2,
9122 OP_VEXT3,
9123 OP_VUZPL, // VUZP, left result
9124 OP_VUZPR, // VUZP, right result
9125 OP_VZIPL, // VZIP, left result
9126 OP_VZIPR, // VZIP, right result
9127 OP_VTRNL, // VTRN, left result
9128 OP_VTRNR // VTRN, right result
9129 };
9130
9131 if (OpNum == OP_COPY) {
9132 if (LHSID == (1 * 9 + 2) * 9 + 3)
9133 return LHS;
9134 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
9135 return RHS;
9136 }
9137
9141 EVT VT = OpLHS.getValueType();
9142
9143 switch (OpNum) {
9144 default:
9145 llvm_unreachable("Unknown shuffle opcode!");
9146 case OP_VREV:
9147 // VREV divides the vector in half and swaps within the half.
9148 if (VT.getVectorElementType() == MVT::i32 ||
9150 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
9151 // vrev <4 x i16> -> REV32
9152 if (VT.getVectorElementType() == MVT::i16 ||
9155 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
9156 // vrev <4 x i8> -> REV16
9158 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
9159 case OP_VDUP0:
9160 case OP_VDUP1:
9161 case OP_VDUP2:
9162 case OP_VDUP3: {
9163 EVT EltTy = VT.getVectorElementType();
9164 unsigned Opcode;
9165 if (EltTy == MVT::i8)
9166 Opcode = AArch64ISD::DUPLANE8;
9167 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
9168 Opcode = AArch64ISD::DUPLANE16;
9169 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
9170 Opcode = AArch64ISD::DUPLANE32;
9171 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
9172 Opcode = AArch64ISD::DUPLANE64;
9173 else
9174 llvm_unreachable("Invalid vector element type?");
9175
9176 if (VT.getSizeInBits() == 64)
9177 OpLHS = WidenVector(OpLHS, DAG);
9178 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
9179 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
9180 }
9181 case OP_VEXT1:
9182 case OP_VEXT2:
9183 case OP_VEXT3: {
9184 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
9185 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
9186 DAG.getConstant(Imm, dl, MVT::i32));
9187 }
9188 case OP_VUZPL:
9189 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
9190 OpRHS);
9191 case OP_VUZPR:
9192 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
9193 OpRHS);
9194 case OP_VZIPL:
9195 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
9196 OpRHS);
9197 case OP_VZIPR:
9198 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
9199 OpRHS);
9200 case OP_VTRNL:
9201 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
9202 OpRHS);
9203 case OP_VTRNR:
9204 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
9205 OpRHS);
9206 }
9207}
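// Illustrative decoding of a PFEntry (field layout only; no particular table
// entry is assumed here): bits [31:30] hold the cost, bits [29:26] the opcode
// (OpNum), bits [25:13] LHSID and bits [12:0] RHSID. Each ID packs four lane
// selectors in base 9, so the identity selection <0, 1, 2, 3> encodes as
// ((0*9 + 1)*9 + 2)*9 + 3 == 102, which is the LHSID the OP_COPY case tests
// for, and <4, 5, 6, 7> encodes as ((4*9 + 5)*9 + 6)*9 + 7 == 3382.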
9208
9210 SelectionDAG &DAG) {
9211 // Check to see if we can use the TBL instruction.
9212 SDValue V1 = Op.getOperand(0);
9213 SDValue V2 = Op.getOperand(1);
9214 SDLoc DL(Op);
9215
9216 EVT EltVT = Op.getValueType().getVectorElementType();
9217 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
9218
9220 for (int Val : ShuffleMask) {
9221 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
9222 unsigned Offset = Byte + Val * BytesPerElt;
9223 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
9224 }
9225 }
9226
9228 unsigned IndexLen = 8;
9229 if (Op.getValueSizeInBits() == 128) {
9231 IndexLen = 16;
9232 }
9233
9236
9237 SDValue Shuffle;
9238 if (V2.getNode()->isUndef()) {
9239 if (IndexLen == 8)
9241 Shuffle = DAG.getNode(
9243 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
9245 makeArrayRef(TBLMask.data(), IndexLen)));
9246 } else {
9247 if (IndexLen == 8) {
9249 Shuffle = DAG.getNode(
9251 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
9253 makeArrayRef(TBLMask.data(), IndexLen)));
9254 } else {
9255 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
9256 // cannot currently represent the register constraints on the input
9257 // table registers.
9258 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
9259 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
9260 // IndexLen));
9261 Shuffle = DAG.getNode(
9263 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
9265 makeArrayRef(TBLMask.data(), IndexLen)));
9266 }
9267 }
9268 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
9269}
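// Example of the byte-index mask built above (illustrative): for a v4i16
// shuffle with mask <0, 2, 1, 3>, BytesPerElt is 2, so TBLMask becomes
// <0, 1, 4, 5, 2, 3, 6, 7>, i.e. each 16-bit lane is expanded into the two
// byte indices that TBL needs.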
9270
9271static unsigned getDUPLANEOp(EVT EltType) {
9272 if (EltType == MVT::i8)
9273 return AArch64ISD::DUPLANE8;
9274 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
9275 return AArch64ISD::DUPLANE16;
9276 if (EltType == MVT::i32 || EltType == MVT::f32)
9277 return AArch64ISD::DUPLANE32;
9278 if (EltType == MVT::i64 || EltType == MVT::f64)
9279 return AArch64ISD::DUPLANE64;
9280
9281 llvm_unreachable("Invalid vector element type?");
9282}
9283
9284static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
9285 unsigned Opcode, SelectionDAG &DAG) {
9286 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
9287 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
9288 // Match: dup (bitcast (extract_subv X, C)), LaneC
9289 if (BitCast.getOpcode() != ISD::BITCAST ||
9290 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
9291 return false;
9292
9293 // The extract index must align in the destination type. That may not
9294 // happen if the bitcast is from narrow to wide type.
9295 SDValue Extract = BitCast.getOperand(0);
9296 unsigned ExtIdx = Extract.getConstantOperandVal(1);
9297 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
9298 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
9299 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
9301 return false;
9302
9303 // Update the lane value by offsetting with the scaled extract index.
9305
9306 // Determine the casted vector type of the wide vector input.
9307 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
9308 // Examples:
9309 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
9310 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
9311 unsigned SrcVecNumElts =
9313 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
9315 return true;
9316 };
9317 MVT CastVT;
9318 if (getScaledOffsetDup(V, Lane, CastVT)) {
9319 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
9320 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
9321 // The lane is incremented by the index of the extract.
9322 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
9323 Lane += V.getConstantOperandVal(1);
9324 V = V.getOperand(0);
9325 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
9326 // The lane is decremented if we are splatting from the 2nd operand.
9327 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
9328 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
9329 Lane -= Idx * VT.getVectorNumElements() / 2;
9330 V = WidenVector(V.getOperand(Idx), DAG);
9331 } else if (VT.getSizeInBits() == 64) {
9332 // Widen the operand to 128-bit register with undef.
9333 V = WidenVector(V, DAG);
9334 }
9335 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
9336}
9337
9338SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
9339 SelectionDAG &DAG) const {
9340 SDLoc dl(Op);
9341 EVT VT = Op.getValueType();
9342
9344
9345 if (useSVEForFixedLengthVectorVT(VT))
9346 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
9347
9348 // Convert shuffles that are directly supported on NEON to target-specific
9349 // DAG nodes, instead of keeping them as shuffles and matching them again
9350 // during code selection. This is more efficient and avoids the possibility
9351 // of inconsistencies between legalization and selection.
9352 ArrayRef<int> ShuffleMask = SVN->getMask();
9353
9354 SDValue V1 = Op.getOperand(0);
9355 SDValue V2 = Op.getOperand(1);
9356
9357 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
9358 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
9359 "Unexpected VECTOR_SHUFFLE mask size!");
9360
9361 if (SVN->isSplat()) {
9362 int Lane = SVN->getSplatIndex();
9363 // If this is undef splat, generate it via "just" vdup, if possible.
9364 if (Lane == -1)
9365 Lane = 0;
9366
9367 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
9368 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
9369 V1.getOperand(0));
9370 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
9371 // constant. If so, we can just reference the lane's definition directly.
9372 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
9373 !isa<ConstantSDNode>(V1.getOperand(Lane)))
9374 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
9375
9376 // Otherwise, duplicate from the lane of the input vector.
9377 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
9378 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
9379 }
9380
9381 // Check if the mask matches a DUP for a wider element
9382 for (unsigned LaneSize : {64U, 32U, 16U}) {
9383 unsigned Lane = 0;
9384 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
9385 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
9388 // Cast V1 to an integer vector with required lane size
9390 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
9392 V1 = DAG.getBitcast(NewVecTy, V1);
9393 // Construct the DUP instruction
9394 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
9395 // Cast back to the original type
9396 return DAG.getBitcast(VT, V1);
9397 }
9398 }
9399
9400 if (isREVMask(ShuffleMask, VT, 64))
9401 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
9402 if (isREVMask(ShuffleMask, VT, 32))
9403 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
9404 if (isREVMask(ShuffleMask, VT, 16))
9405 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
9406
9407 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
9408 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
9409 ShuffleVectorInst::isReverseMask(ShuffleMask)) {
9410 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
9411 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
9412 DAG.getConstant(8, dl, MVT::i32));
9413 }
9414
9415 bool ReverseEXT = false;
9416 unsigned Imm;
9417 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
9418 if (ReverseEXT)
9419 std::swap(V1, V2);
9420 Imm *= getExtFactor(V1);
9421 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
9422 DAG.getConstant(Imm, dl, MVT::i32));
9423 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
9424 Imm *= getExtFactor(V1);
9425 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
9426 DAG.getConstant(Imm, dl, MVT::i32));
9427 }
9428
9429 unsigned WhichResult;
9430 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
9431 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
9432 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
9433 }
9434 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
9435 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
9436 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
9437 }
9438 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
9439 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
9440 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
9441 }
9442
9443 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
9444 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
9445 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
9446 }
9447 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
9448 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
9449 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
9450 }
9451 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
9452 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
9453 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
9454 }
9455
9457 return Concat;
9458
9459 bool DstIsLeft;
9460 int Anomaly;
9461 int NumInputElements = V1.getValueType().getVectorNumElements();
9462 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
9465
9466 SDValue SrcVec = V1;
9467 int SrcLane = ShuffleMask[Anomaly];
9468 if (SrcLane >= NumInputElements) {
9469 SrcVec = V2;
9471 }
9473
9475
9476 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
9478
9479 return DAG.getNode(
9482 DstLaneV);
9483 }
9484
9485 // If the shuffle is not directly supported and it has 4 elements, use
9486 // the PerfectShuffle-generated table to synthesize it from other shuffles.
9487 unsigned NumElts = VT.getVectorNumElements();
9488 if (NumElts == 4) {
9489 unsigned PFIndexes[4];
9490 for (unsigned i = 0; i != 4; ++i) {
9491 if (ShuffleMask[i] < 0)
9492 PFIndexes[i] = 8;
9493 else
9494 PFIndexes[i] = ShuffleMask[i];
9495 }
9496
9497 // Compute the index in the perfect shuffle table.
9498 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
9499 PFIndexes[2] * 9 + PFIndexes[3];
9501 unsigned Cost = (PFEntry >> 30);
9502
9503 if (Cost <= 4)
9504 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
9505 }
9506
9507 return GenerateTBL(Op, ShuffleMask, DAG);
9508}
9509
9510SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
9511 SelectionDAG &DAG) const {
9512 SDLoc dl(Op);
9513 EVT VT = Op.getValueType();
9514 EVT ElemVT = VT.getScalarType();
9515 SDValue SplatVal = Op.getOperand(0);
9516
9517 if (useSVEForFixedLengthVectorVT(VT))
9518 return LowerToScalableOp(Op, DAG);
9519
9520 // Extend input splat value where needed to fit into a GPR (32b or 64b only)
9521 // FPRs don't have this restriction.
9522 switch (ElemVT.getSimpleVT().SimpleTy) {
9523 case MVT::i1: {
9524 // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
9525 // lowering code.
9526 if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
9527 if (ConstVal->isOne())
9528 return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
9529 // TODO: Add special case for constant false
9530 }
9531 // The general case of i1. There isn't any natural way to do this,
9532 // so we use some trickery with whilelo.
9535 DAG.getValueType(MVT::i1));
9536 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
9537 MVT::i64);
9538 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
9539 DAG.getConstant(0, dl, MVT::i64), SplatVal);
9540 }
9541 case MVT::i8:
9542 case MVT::i16:
9543 case MVT::i32:
9545 break;
9546 case MVT::i64:
9548 break;
9549 case MVT::f16:
9550 case MVT::bf16:
9551 case MVT::f32:
9552 case MVT::f64:
9553 // Fine as is
9554 break;
9555 default:
9556 report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
9557 }
9558
9559 return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
9560}
9561
9562SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
9563 SelectionDAG &DAG) const {
9564 SDLoc DL(Op);
9565
9566 EVT VT = Op.getValueType();
9567 if (!isTypeLegal(VT) || !VT.isScalableVector())
9568 return SDValue();
9569
9570 // Current lowering only supports the SVE-ACLE types.
9572 return SDValue();
9573
9574 // The DUPQ operation is independent of element type, so normalise to i64s.
9575 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
9576 SDValue Idx128 = Op.getOperand(2);
9577
9578 // DUPQ can be used when idx is in range.
9580 if (CIdx && (CIdx->getZExtValue() <= 3)) {
9581 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
9582 SDNode *DUPQ =
9583 DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
9584 return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
9585 }
9586
9587 // The ACLE says this must produce the same result as:
9588 // svtbl(data, svadd_x(svptrue_b64(),
9589 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
9590 // index * 2))
9591 SDValue One = DAG.getConstant(1, DL, MVT::i64);
9593
9594 // create the vector 0,1,0,1,...
9596 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
9597
9598 // create the vector idx64,idx64+1,idx64,idx64+1,...
9601 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
9602
9603 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
9604 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
9605 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
9606}
9607
9608
9610 APInt &UndefBits) {
9611 EVT VT = BVN->getValueType(0);
9612 APInt SplatBits, SplatUndef;
9613 unsigned SplatBitSize;
9614 bool HasAnyUndefs;
9615 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9616 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
9617
9618 for (unsigned i = 0; i < NumSplats; ++i) {
9619 CnstBits <<= SplatBitSize;
9620 UndefBits <<= SplatBitSize;
9621 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
9622 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
9623 }
9624
9625 return true;
9626 }
9627
9628 return false;
9629}
9630
9631// Try 64-bit splatted SIMD immediate.
9633 const APInt &Bits) {
9634 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9635 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9636 EVT VT = Op.getValueType();
9637 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
9638
9641
9642 SDLoc dl(Op);
9643 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9644 DAG.getConstant(Value, dl, MVT::i32));
9645 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9646 }
9647 }
9648
9649 return SDValue();
9650}
9651
9652// Try 32-bit splatted SIMD immediate.
9654 const APInt &Bits,
9655 const SDValue *LHS = nullptr) {
9656 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9657 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9658 EVT VT = Op.getValueType();
9659 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9660 bool isAdvSIMDModImm = false;
9661 uint64_t Shift;
9662
9665 Shift = 0;
9666 }
9669 Shift = 8;
9670 }
9673 Shift = 16;
9674 }
9677 Shift = 24;
9678 }
9679
9680 if (isAdvSIMDModImm) {
9681 SDLoc dl(Op);
9682 SDValue Mov;
9683
9684 if (LHS)
9685 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9686 DAG.getConstant(Value, dl, MVT::i32),
9687 DAG.getConstant(Shift, dl, MVT::i32));
9688 else
9689 Mov = DAG.getNode(NewOp, dl, MovTy,
9690 DAG.getConstant(Value, dl, MVT::i32),
9691 DAG.getConstant(Shift, dl, MVT::i32));
9692
9693 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9694 }
9695 }
9696
9697 return SDValue();
9698}
9699
9700// Try 16-bit splatted SIMD immediate.
9702 const APInt &Bits,
9703 const SDValue *LHS = nullptr) {
9704 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9705 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9706 EVT VT = Op.getValueType();
9707 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
9708 bool isAdvSIMDModImm = false;
9709 uint64_t Shift;
9710
9713 Shift = 0;
9714 }
9717 Shift = 8;
9718 }
9719
9720 if (isAdvSIMDModImm) {
9721 SDLoc dl(Op);
9722 SDValue Mov;
9723
9724 if (LHS)
9725 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9726 DAG.getConstant(Value, dl, MVT::i32),
9727 DAG.getConstant(Shift, dl, MVT::i32));
9728 else
9729 Mov = DAG.getNode(NewOp, dl, MovTy,
9730 DAG.getConstant(Value, dl, MVT::i32),
9731 DAG.getConstant(Shift, dl, MVT::i32));
9732
9733 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9734 }
9735 }
9736
9737 return SDValue();
9738}
9739
9740// Try 32-bit splatted SIMD immediate with shifted ones.
9742 SelectionDAG &DAG, const APInt &Bits) {
9743 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9744 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9745 EVT VT = Op.getValueType();
9746 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9747 bool isAdvSIMDModImm = false;
9748 uint64_t Shift;
9749
9752 Shift = 264;
9753 }
9756 Shift = 272;
9757 }
9758
9759 if (isAdvSIMDModImm) {
9760 SDLoc dl(Op);
9761 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9762 DAG.getConstant(Value, dl, MVT::i32),
9763 DAG.getConstant(Shift, dl, MVT::i32));
9764 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9765 }
9766 }
9767
9768 return SDValue();
9769}
9770
9771// Try 8-bit splatted SIMD immediate.
9773 const APInt &Bits) {
9774 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9775 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9776 EVT VT = Op.getValueType();
9777 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
9778
9781
9782 SDLoc dl(Op);
9783 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9784 DAG.getConstant(Value, dl, MVT::i32));
9785 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9786 }
9787 }
9788
9789 return SDValue();
9790}
9791
9792// Try FP splatted SIMD immediate.
9794 const APInt &Bits) {
9795 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9796 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9797 EVT VT = Op.getValueType();
9798 bool isWide = (VT.getSizeInBits() == 128);
9799 MVT MovTy;
9800 bool isAdvSIMDModImm = false;
9801
9805 }
9806 else if (isWide &&
9809 MovTy = MVT::v2f64;
9810 }
9811
9812 if (isAdvSIMDModImm) {
9813 SDLoc dl(Op);
9814 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9815 DAG.getConstant(Value, dl, MVT::i32));
9816 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9817 }
9818 }
9819
9820 return SDValue();
9821}
9822
9823// Specialized code to quickly find if PotentialBVec is a BuildVector that
9824// consists of only the same constant int value, returned in reference arg
9825// ConstVal
9827 uint64_t &ConstVal) {
9829 if (!Bvec)
9830 return false;
9832 if (!FirstElt)
9833 return false;
9834 EVT VT = Bvec->getValueType(0);
9835 unsigned NumElts = VT.getVectorNumElements();
9836 for (unsigned i = 1; i < NumElts; ++i)
9837 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
9838 return false;
9839 ConstVal = FirstElt->getZExtValue();
9840 return true;
9841}
9842
9843static unsigned getIntrinsicID(const SDNode *N) {
9844 unsigned Opcode = N->getOpcode();
9845 switch (Opcode) {
9846 default:
9849 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9850 if (IID < Intrinsic::num_intrinsics)
9851 return IID;
9853 }
9854 }
9855}
9856
9857// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
9858// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
9859// BUILD_VECTORs with constant element C1, C2 is a constant, and:
9860// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
9861// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
9862// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
9864 EVT VT = N->getValueType(0);
9865
9866 if (!VT.isVector())
9867 return SDValue();
9868
9869 SDLoc DL(N);
9870
9871 SDValue And;
9872 SDValue Shift;
9873
9874 SDValue FirstOp = N->getOperand(0);
9875 unsigned FirstOpc = FirstOp.getOpcode();
9876 SDValue SecondOp = N->getOperand(1);
9877 unsigned SecondOpc = SecondOp.getOpcode();
9878
9879 // Is one of the operands an AND or a BICi? The AND may have been optimised to
9880 // a BICi in order to use an immediate instead of a register.
9881 // Is the other operand a shl or lshr? This will have been turned into:
9882 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
9883 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
9885 And = FirstOp;
9886 Shift = SecondOp;
9887
9888 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
9890 And = SecondOp;
9891 Shift = FirstOp;
9892 } else
9893 return SDValue();
9894
9895 bool IsAnd = And.getOpcode() == ISD::AND;
9896 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
9897
9898 // Is the shift amount constant?
9900 if (!C2node)
9901 return SDValue();
9902
9903 uint64_t C1;
9904 if (IsAnd) {
9905 // Is the and mask vector all constant?
9906 if (!isAllConstantBuildVector(And.getOperand(1), C1))
9907 return SDValue();
9908 } else {
9909 // Reconstruct the corresponding AND immediate from the two BICi immediates.
9910 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
9911 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
9912 assert(C1nodeImm && C1nodeShift);
9913 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
9914 }
9915
9916 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
9917 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
9918 // how much one can shift elements of a particular size?
9919 uint64_t C2 = C2node->getZExtValue();
9920 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
9921 if (C2 > ElemSizeInBits)
9922 return SDValue();
9923
9924 APInt C1AsAPInt(ElemSizeInBits, C1);
9925 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
9926 : APInt::getLowBitsSet(ElemSizeInBits, C2);
9927 if (C1AsAPInt != RequiredC1)
9928 return SDValue();
9929
9930 SDValue X = And.getOperand(0);
9931 SDValue Y = Shift.getOperand(0);
9932
9933 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
9934 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
9935
9936 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
9937 LLVM_DEBUG(N->dump(&DAG));
9938 LLVM_DEBUG(dbgs() << "into: \n");
9939 LLVM_DEBUG(ResultSLI->dump(&DAG));
9940
9941 ++NumShiftInserts;
9942 return ResultSLI;
9943}
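// Illustrative sketch (editor's example, not part of the upstream source):
// with v8i16 lanes and C2 == 8, the SLI form requires C1 == 0x00ff in every
// lane, so
//   (or (and X, <0x00ff x 8>), (AArch64ISD::VSHL Y, #8))
// can be rewritten as (AArch64ISD::VSLI X, Y, #8), which selects to
//   sli v0.8h, v1.8h, #8
// assuming X is already in v0 and Y in v1.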
9944
9945SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
9946 SelectionDAG &DAG) const {
9947 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
9948 return LowerToScalableOp(Op, DAG);
9949
9950 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
9951 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
9952 return Res;
9953
9954 EVT VT = Op.getValueType();
9955
9956 SDValue LHS = Op.getOperand(0);
9957 BuildVectorSDNode *BVN =
9958 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
9959 if (!BVN) {
9960 // OR commutes, so try swapping the operands.
9961 LHS = Op.getOperand(1);
9962 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
9963 }
9964 if (!BVN)
9965 return Op;
9966
9967 APInt DefBits(VT.getSizeInBits(), 0);
9968 APInt UndefBits(VT.getSizeInBits(), 0);
9969 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
9970 SDValue NewOp;
9971
9972 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
9973 DefBits, &LHS)) ||
9974 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
9975 DefBits, &LHS)))
9976 return NewOp;
9977
9978 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
9979 UndefBits, &LHS)) ||
9980 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
9981 UndefBits, &LHS)))
9982 return NewOp;
9983 }
9984
9985 // We can always fall back to a non-immediate OR.
9986 return Op;
9987}
9988
9989// Normalize the operands of BUILD_VECTOR. The value of constant operands will
9990// be truncated to fit element width.
9992 SelectionDAG &DAG) {
9993 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
9994 SDLoc dl(Op);
9995 EVT VT = Op.getValueType();
9996 EVT EltTy= VT.getVectorElementType();
9997
9998 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
9999 return Op;
10000
10001 SmallVector<SDValue, 16> Ops;
10002 for (SDValue Lane : Op->ops()) {
10003 // For integer vectors, type legalization would have promoted the
10004 // operands already. Otherwise, if Op is a floating-point splat
10005 // (with operands cast to integers), then the only possibilities
10006 // are constants and UNDEFs.
10007 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
10008 APInt LowBits(EltTy.getSizeInBits(),
10009 CstLane->getZExtValue());
10010 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
10011 } else if (Lane.getNode()->isUndef()) {
10012 Lane = DAG.getUNDEF(MVT::i32);
10013 } else {
10014 assert(Lane.getValueType() == MVT::i32 &&
10015 "Unexpected BUILD_VECTOR operand type");
10016 }
10017 Ops.push_back(Lane);
10018 }
10019 return DAG.getBuildVector(VT, dl, Ops);
10020}
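// Worked example (editor's sketch, not from the original file): for a v8i8
// BUILD_VECTOR whose operands were promoted to i32, a lane holding the
// constant 0x1ff is truncated to the 8-bit element width, so the lane is
// rebuilt as the i32 constant 0xff; UNDEF lanes simply stay UNDEF.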
10021
10022static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
10023 EVT VT = Op.getValueType();
10024
10025 APInt DefBits(VT.getSizeInBits(), 0);
10026 APInt UndefBits(VT.getSizeInBits(), 0);
10027 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
10028 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
10029 SDValue NewOp;
10030 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
10031 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10032 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
10033 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10034 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
10035 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
10036 return NewOp;
10037
10038 DefBits = ~DefBits;
10039 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
10040 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
10041 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
10042 return NewOp;
10043
10044 DefBits = UndefBits;
10045 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
10046 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10047 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
10048 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10049 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
10050 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
10051 return NewOp;
10052
10053 DefBits = ~UndefBits;
10054 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
10055 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
10056 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
10057 return NewOp;
10058 }
10059
10060 return SDValue();
10061}
10062
10063SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
10064 SelectionDAG &DAG) const {
10065 EVT VT = Op.getValueType();
10066
10067 // Try to build a simple constant vector.
10068 Op = NormalizeBuildVector(Op, DAG);
10069 if (VT.isInteger()) {
10070 // Certain vector constants, used to express things like logical NOT and
10071 // arithmetic NEG, are passed through unmodified. This allows special
10072 // patterns for these operations to match, which will lower these constants
10073 // to whatever is proven necessary.
10074 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()))
10075 if (BVN->isConstant())
10076 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
10077 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
10078 APInt Val(BitSize,
10079 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
10080 if (Val.isNullValue() || Val.isAllOnesValue())
10081 return Op;
10082 }
10083 }
10084
10085 if (SDValue V = ConstantBuildVector(Op, DAG))
10086 return V;
10087
10088 // Scan through the operands to find some interesting properties we can
10089 // exploit:
10090 // 1) If only one value is used, we can use a DUP, or
10091 // 2) if only the low element is not undef, we can just insert that, or
10092 // 3) if only one constant value is used (w/ some non-constant lanes),
10093 // we can splat the constant value into the whole vector then fill
10094 // in the non-constant lanes.
10095 // 4) FIXME: If different constant values are used, but we can intelligently
10096 // select the values we'll be overwriting for the non-constant
10097 // lanes such that we can directly materialize the vector
10098 // some other way (MOVI, e.g.), we can be sneaky.
10099 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
10100 SDLoc dl(Op);
10101 unsigned NumElts = VT.getVectorNumElements();
10102 bool isOnlyLowElement = true;
10103 bool usesOnlyOneValue = true;
10104 bool usesOnlyOneConstantValue = true;
10105 bool isConstant = true;
10106 bool AllLanesExtractElt = true;
10107 unsigned NumConstantLanes = 0;
10108 unsigned NumDifferentLanes = 0;
10109 unsigned NumUndefLanes = 0;
10110 SDValue Value;
10111 SDValue ConstantValue;
10112 for (unsigned i = 0; i < NumElts; ++i) {
10113 SDValue V = Op.getOperand(i);
10114 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10115 AllLanesExtractElt = false;
10116 if (V.isUndef()) {
10117 ++NumUndefLanes;
10118 continue;
10119 }
10120 if (i > 0)
10121 isOnlyLowElement = false;
10122 if (!isIntOrFPConstant(V))
10123 isConstant = false;
10124
10125 if (isIntOrFPConstant(V)) {
10126 ++NumConstantLanes;
10127 if (!ConstantValue.getNode())
10128 ConstantValue = V;
10129 else if (ConstantValue != V)
10130 usesOnlyOneConstantValue = false;
10131 }
10132
10133 if (!Value.getNode())
10134 Value = V;
10135 else if (V != Value) {
10136 usesOnlyOneValue = false;
10137 ++NumDifferentLanes;
10138 }
10139 }
10140
10141 if (!Value.getNode()) {
10142 LLVM_DEBUG(
10143 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
10144 return DAG.getUNDEF(VT);
10145 }
10146
10147 // Convert BUILD_VECTOR where all elements but the lowest are undef into
10148 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
10149 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
10150 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
10151 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
10152 "SCALAR_TO_VECTOR node\n");
10153 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
10154 }
10155
10156 if (AllLanesExtractElt) {
10157 SDNode *Vector = nullptr;
10158 bool Even = false;
10159 bool Odd = false;
10160 // Check whether the extract elements match the Even pattern <0,2,4,...> or
10161 // the Odd pattern <1,3,5,...>.
10162 for (unsigned i = 0; i < NumElts; ++i) {
10163 SDValue V = Op.getOperand(i);
10164 const SDNode *N = V.getNode();
10165 if (!isa<ConstantSDNode>(N->getOperand(1)))
10166 break;
10167 SDValue N0 = N->getOperand(0);
10168
10169 // All elements are extracted from the same vector.
10170 if (!Vector) {
10171 Vector = N0.getNode();
10172 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
10173 // BUILD_VECTOR.
10174 if (VT.getVectorElementType() !=
10175 N0.getValueType().getVectorElementType())
10176 break;
10177 } else if (Vector != N0.getNode()) {
10178 Odd = false;
10179 Even = false;
10180 break;
10181 }
10182
10183 // Extracted values are either at Even indices <0,2,4,...> or at Odd
10184 // indices <1,3,5,...>.
10185 uint64_t Val = N->getConstantOperandVal(1);
10186 if (Val == 2 * i) {
10187 Even = true;
10188 continue;
10189 }
10190 if (Val - 1 == 2 * i) {
10191 Odd = true;
10192 continue;
10193 }
10194
10195 // Something does not match: abort.
10196 Odd = false;
10197 Even = false;
10198 break;
10199 }
10200 if (Even || Odd) {
10201 SDValue LHS =
10202 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
10203 DAG.getConstant(0, dl, MVT::i64));
10204 SDValue RHS =
10205 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
10206 DAG.getConstant(NumElts, dl, MVT::i64));
10207
10208 if (Even && !Odd)
10209 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
10210 RHS);
10211 if (Odd && !Even)
10212 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
10213 RHS);
10214 }
10215 }
10216
10217 // Use DUP for non-constant splats. For f32 constant splats, reduce to
10218 // i32 and try again.
10219 if (usesOnlyOneValue) {
10220 if (!isConstant) {
10221 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10222 Value.getValueType() != VT) {
10223 LLVM_DEBUG(
10224 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
10225 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
10226 }
10227
10228 // This is actually a DUPLANExx operation, which keeps everything vectory.
10229
10230 SDValue Lane = Value.getOperand(1);
10231 Value = Value.getOperand(0);
10232 if (Value.getValueSizeInBits() == 64) {
10233 LLVM_DEBUG(
10234 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
10235 "widening it\n");
10236 Value = WidenVector(Value, DAG);
10237 }
10238
10239 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
10240 return DAG.getNode(Opcode, dl, VT, Value, Lane);
10241 }
10242
10243 if (VT.getVectorElementType().isFloatingPoint()) {
10244 SmallVector<SDValue, 8> Ops;
10245 EVT EltTy = VT.getVectorElementType();
10246 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
10247 EltTy == MVT::f64) && "Unsupported floating-point vector type");
10248 LLVM_DEBUG(
10249 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
10250 "BITCASTS, and try again\n");
10251 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
10252 for (unsigned i = 0; i < NumElts; ++i)
10253 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
10254 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
10255 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
10256 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
10257 Val.dump(););
10258 Val = LowerBUILD_VECTOR(Val, DAG);
10259 if (Val.getNode())
10260 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
10261 }
10262 }
10263
10264 // If we need to insert a small number of different non-constant elements and
10265 // the vector width is sufficiently large, prefer using DUP with the common
10266 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
10267 // skip the constant lane handling below.
10268 bool PreferDUPAndInsert =
10269 !isConstant && NumDifferentLanes >= 1 &&
10270 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
10271 NumDifferentLanes >= NumConstantLanes;
10272
10273 // If there was only one constant value used and for more than one lane,
10274 // start by splatting that value, then replace the non-constant lanes. This
10275 // is better than the default, which will perform a separate initialization
10276 // for each lane.
10277 if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
10278 // Firstly, try to materialize the splat constant.
10279 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
10280 Val = ConstantBuildVector(Vec, DAG);
10281 if (!Val) {
10282 // Otherwise, materialize the constant and splat it.
10283 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
10284 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
10285 }
10286
10287 // Now insert the non-constant lanes.
10288 for (unsigned i = 0; i < NumElts; ++i) {
10289 SDValue V = Op.getOperand(i);
10290 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
10291 if (!isIntOrFPConstant(V))
10292 // Note that type legalization likely mucked about with the VT of the
10293 // source operand, so we may have to convert it here before inserting.
10294 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
10295 }
10296 return Val;
10297 }
10298
10299 // This will generate a load from the constant pool.
10300 if (isConstant) {
10301 LLVM_DEBUG(
10302 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
10303 "expansion\n");
10304 return SDValue();
10305 }
10306
10307 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
10308 if (NumElts >= 4) {
10309 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
10310 return shuffle;
10311 }
10312
10313 if (PreferDUPAndInsert) {
10314 // First, build a constant vector with the common element.
10315 SmallVector<SDValue, 8> Ops(NumElts, Value);
10316 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
10317 // Next, insert the elements that do not match the common value.
10318 for (unsigned I = 0; I < NumElts; ++I)
10319 if (Op.getOperand(I) != Value)
10320 NewVector =
10321 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
10322 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
10323
10324 return NewVector;
10325 }
10326
10327 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
10328 // know the default expansion would otherwise fall back on something even
10329 // worse. For a vector with one or two non-undef values, that's
10330 // scalar_to_vector for the elements followed by a shuffle (provided the
10331 // shuffle is valid for the target) and materialization element by element
10332 // on the stack followed by a load for everything else.
10333 if (!isConstant && !usesOnlyOneValue) {
10334 LLVM_DEBUG(
10335 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
10336 "of INSERT_VECTOR_ELT\n");
10337
10338 SDValue Vec = DAG.getUNDEF(VT);
10339 SDValue Op0 = Op.getOperand(0);
10340 unsigned i = 0;
10341
10342 // Use SCALAR_TO_VECTOR for lane zero to
10343 // a) Avoid a RMW dependency on the full vector register, and
10344 // b) Allow the register coalescer to fold away the copy if the
10345 // value is already in an S or D register, and we're forced to emit an
10346 // INSERT_SUBREG that we can't fold anywhere.
10347 //
10348 // We also allow types like i8 and i16 which are illegal scalar but legal
10349 // vector element types. After type-legalization the inserted value is
10350 // extended (i32) and it is safe to cast them to the vector type by ignoring
10351 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
10352 if (!Op0.isUndef()) {
10353 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
10354 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
10355 ++i;
10356 }
10357 LLVM_DEBUG(if (i < NumElts) dbgs()
10358 << "Creating nodes for the other vector elements:\n";);
10359 for (; i < NumElts; ++i) {
10360 SDValue V = Op.getOperand(i);
10361 if (V.isUndef())
10362 continue;
10363 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
10364 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
10365 }
10366 return Vec;
10367 }
10368
10369 LLVM_DEBUG(
10370 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
10371 "better alternative\n");
10372 return SDValue();
10373}
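// Illustrative example (editor's sketch): a v4i32 BUILD_VECTOR of
// <%x, %x, %x, %y> with non-constant %x and %y typically takes the
// DUP-and-insert path above, i.e. a DUP of %x followed by one
// INSERT_VECTOR_ELT of %y at lane 3, roughly
//   dup v0.4s, w0
//   mov v0.s[3], w1
// rather than four separate scalar inserts.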
10374
10375SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
10376 SelectionDAG &DAG) const {
10377 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
10378 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
10379
10380 assert(Op.getValueType().isScalableVector() &&
10381 isTypeLegal(Op.getValueType()) &&
10382 "Expected legal scalable vector type!");
10383
10384 if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2)
10385 return Op;
10386
10387 return SDValue();
10388}
10389
10390SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
10391 SelectionDAG &DAG) const {
10392 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
10393
10394 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
10395 return LowerFixedLengthInsertVectorElt(Op, DAG);
10396
10397 // Check for non-constant or out of range lane.
10398 EVT VT = Op.getOperand(0).getValueType();
10399
10400 if (VT.getScalarType() == MVT::i1) {
10401 EVT VectorVT = getPromotedVTForPredicate(VT);
10402 SDLoc DL(Op);
10403 SDValue ExtendedVector =
10404 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
10405 SDValue ExtendedValue =
10406 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
10407 VectorVT.getScalarType().getSizeInBits() < 32
10408 ? MVT::i32
10409 : VectorVT.getScalarType());
10410 ExtendedVector =
10411 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
10412 ExtendedValue, Op.getOperand(2));
10413 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
10414 }
10415
10416 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
10417 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
10418 return SDValue();
10419
10420 // Insertion/extraction are legal for V128 types.
10421 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
10422 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
10423 VT == MVT::v8f16 || VT == MVT::v8bf16)
10424 return Op;
10425
10426 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
10427 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
10428 VT != MVT::v4bf16)
10429 return SDValue();
10430
10431 // For V64 types, we perform insertion by expanding the value
10432 // to a V128 type and perform the insertion on that.
10433 SDLoc DL(Op);
10434 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
10435 EVT WideTy = WideVec.getValueType();
10436
10437 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
10438 Op.getOperand(1), Op.getOperand(2));
10439 // Re-narrow the resultant vector.
10440 return NarrowVector(Node, DAG);
10441}
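// Example (a sketch): inserting into a v4i16 value widens it to v8i16 first,
// performs the INSERT_VECTOR_ELT on the 128-bit type, and then narrows the
// result back, so only the V128 insert patterns need to exist in ISel.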
10442
10443SDValue
10444AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
10445 SelectionDAG &DAG) const {
10446 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
10447 EVT VT = Op.getOperand(0).getValueType();
10448
10449 if (VT.getScalarType() == MVT::i1) {
10450 // We can't directly extract from an SVE predicate; extend it first.
10451 // (This isn't the only possible lowering, but it's straightforward.)
10452 EVT VectorVT = getPromotedVTForPredicate(VT);
10453 SDLoc DL(Op);
10454 SDValue Extend =
10455 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
10458 Extend, Op.getOperand(1));
10459 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
10460 }
10461
10462 if (useSVEForFixedLengthVectorVT(VT))
10463 return LowerFixedLengthExtractVectorElt(Op, DAG);
10464
10465 // Check for non-constant or out of range lane.
10466 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
10467 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
10468 return SDValue();
10469
10470 // Insertion/extraction are legal for V128 types.
10471 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
10472 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
10473 VT == MVT::v8f16 || VT == MVT::v8bf16)
10474 return Op;
10475
10476 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
10477 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
10478 VT != MVT::v4bf16)
10479 return SDValue();
10480
10481 // For V64 types, we perform extraction by expanding the value
10482 // to a V128 type and perform the extraction on that.
10483 SDLoc DL(Op);
10484 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
10485 EVT WideTy = WideVec.getValueType();
10486
10487 EVT ExtrTy = WideTy.getVectorElementType();
10488 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
10489 ExtrTy = MVT::i32;
10490
10491 // For extractions, we just return the result directly.
10492 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
10493 Op.getOperand(1));
10494}
10495
10496SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
10497 SelectionDAG &DAG) const {
10498 assert(Op.getValueType().isFixedLengthVector() &&
10499 "Only cases that extract a fixed length vector are supported!");
10500
10501 EVT InVT = Op.getOperand(0).getValueType();
10502 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10503 unsigned Size = Op.getValueSizeInBits();
10504
10505 if (InVT.isScalableVector()) {
10506 // This will be matched by custom code during ISelDAGToDAG.
10507 if (Idx == 0 && isPackedVectorType(InVT, DAG))
10508 return Op;
10509
10510 return SDValue();
10511 }
10512
10513 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
10514 if (Idx == 0 && InVT.getSizeInBits() <= 128)
10515 return Op;
10516
10517 // If this is extracting the upper 64-bits of a 128-bit vector, we match
10518 // that directly.
10519 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
10520 InVT.getSizeInBits() == 128)
10521 return Op;
10522
10523 return SDValue();
10524}
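// Example (editor's sketch): extracting <2 x i32> starting at index 2 from a
// <4 x i32> value satisfies Size == 64 and Idx * 32 == 64, so the node is
// returned unchanged and matched directly later as an upper-half
// (subregister-style) extract.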
10525
10526SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
10527 SelectionDAG &DAG) const {
10528 assert(Op.getValueType().isScalableVector() &&
10529 "Only expect to lower inserts into scalable vectors!");
10530
10531 EVT InVT = Op.getOperand(1).getValueType();
10532 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
10533
10534 if (InVT.isScalableVector()) {
10535 SDLoc DL(Op);
10536 EVT VT = Op.getValueType();
10537
10538 if (!isTypeLegal(VT) || !VT.isInteger())
10539 return SDValue();
10540
10541 SDValue Vec0 = Op.getOperand(0);
10542 SDValue Vec1 = Op.getOperand(1);
10543
10544 // Ensure the subvector is half the size of the main vector.
10545 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
10546 return SDValue();
10547
10548 // Extend elements of smaller vector...
10549 EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
10550 SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
10551
10552 if (Idx == 0) {
10553 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
10554 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0);
10555 } else if (Idx == InVT.getVectorMinNumElements()) {
10556 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
10557 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec);
10558 }
10559
10560 return SDValue();
10561 }
10562
10563 // This will be matched by custom code during ISelDAGToDAG.
10564 if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
10565 return Op;
10566
10567 return SDValue();
10568}
10569
10570SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
10571 EVT VT = Op.getValueType();
10572
10573 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
10574 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
10575
10576 assert(VT.isScalableVector() && "Expected a scalable vector.");
10577
10578 bool Signed = Op.getOpcode() == ISD::SDIV;
10579 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
10580
10581 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
10582 return LowerToPredicatedOp(Op, DAG, PredOpcode);
10583
10584 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
10585 // operations, and truncate the result.
10586 EVT WidenedVT;
10587 if (VT == MVT::nxv16i8)
10588 WidenedVT = MVT::nxv8i16;
10589 else if (VT == MVT::nxv8i16)
10590 WidenedVT = MVT::nxv4i32;
10591 else
10592 llvm_unreachable("Unexpected Custom DIV operation");
10593
10594 SDLoc dl(Op);
10595 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
10596 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
10597 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
10598 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
10599 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
10600 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
10601 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
10602 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
10603 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
10604}
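// Rough example (editor's sketch): an nxv16i8 SDIV is unpacked with
// SUNPKLO/SUNPKHI into two nxv8i16 halves, each of which is lowered the same
// way again down to nxv4i32 SDIV_PRED operations, and the narrowed results
// are combined back with UZP1, which here acts as a truncating concatenation.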
10605
10606bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
10607 // Currently no fixed length shuffles that require SVE are legal.
10608 if (useSVEForFixedLengthVectorVT(VT))
10609 return false;
10610
10611 if (VT.getVectorNumElements() == 4 &&
10612 (VT.is128BitVector() || VT.is64BitVector())) {
10613 unsigned PFIndexes[4];
10614 for (unsigned i = 0; i != 4; ++i) {
10615 if (M[i] < 0)
10616 PFIndexes[i] = 8;
10617 else
10618 PFIndexes[i] = M[i];
10619 }
10620
10621 // Compute the index in the perfect shuffle table.
10622 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
10623 PFIndexes[2] * 9 + PFIndexes[3];
10624 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
10625 unsigned Cost = (PFEntry >> 30);
10626
10627 if (Cost <= 4)
10628 return true;
10629 }
10630
10631 bool DummyBool;
10632 int DummyInt;
10633 unsigned DummyUnsigned;
10634
10635 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
10636 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
10637 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
10638 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
10639 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
10640 isZIPMask(M, VT, DummyUnsigned) ||
10641 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
10642 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
10643 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
10644 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
10645 isConcatMask(M, VT, VT.getSizeInBits() == 128));
10646}
10647
10648/// getVShiftImm - Check if this is a valid build_vector for the immediate
10649/// operand of a vector shift operation, where all the elements of the
10650/// build_vector must have the same constant integer value.
10651static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
10652 // Ignore bit_converts.
10653 while (Op.getOpcode() == ISD::BITCAST)
10654 Op = Op.getOperand(0);
10655 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
10656 APInt SplatBits, SplatUndef;
10657 unsigned SplatBitSize;
10658 bool HasAnyUndefs;
10659 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
10660 HasAnyUndefs, ElementBits) ||
10661 SplatBitSize > ElementBits)
10662 return false;
10663 Cnt = SplatBits.getSExtValue();
10664 return true;
10665}
10666
10667/// isVShiftLImm - Check if this is a valid build_vector for the immediate
10668/// operand of a vector shift left operation. That value must be in the range:
10669/// 0 <= Value < ElementBits for a left shift; or
10670/// 0 <= Value <= ElementBits for a long left shift.
10671static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
10672 assert(VT.isVector() && "vector shift count is not a vector type");
10673 int64_t ElementBits = VT.getScalarSizeInBits();
10674 if (!getVShiftImm(Op, ElementBits, Cnt))
10675 return false;
10676 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
10677}
10678
10679/// isVShiftRImm - Check if this is a valid build_vector for the immediate
10680/// operand of a vector shift right operation. The value must be in the range:
10681/// 1 <= Value <= ElementBits for a right shift; or
10682static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
10683 assert(VT.isVector() && "vector shift count is not a vector type");
10684 int64_t ElementBits = VT.getScalarSizeInBits();
10685 if (!getVShiftImm(Op, ElementBits, Cnt))
10686 return false;
10687 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
10688}
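// Example of the accepted ranges (a sketch): for v4i32 operands an immediate
// left shift must be in [0, 31] (or [0, 32] for a long shift), while an
// immediate right shift must be in [1, 32] ([1, 16] for a narrowing shift).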
10689
10690SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
10691 SelectionDAG &DAG) const {
10692 EVT VT = Op.getValueType();
10693
10694 if (VT.getScalarType() == MVT::i1) {
10695 // Lower i1 truncate to `(x & 1) != 0`.
10696 SDLoc dl(Op);
10697 EVT OpVT = Op.getOperand(0).getValueType();
10698 SDValue Zero = DAG.getConstant(0, dl, OpVT);
10699 SDValue One = DAG.getConstant(1, dl, OpVT);
10700 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
10701 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
10702 }
10703
10704 if (!VT.isVector() || VT.isScalableVector())
10705 return SDValue();
10706
10707 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10708 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
10709
10710 return SDValue();
10711}
10712
10713SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
10714 SelectionDAG &DAG) const {
10715 EVT VT = Op.getValueType();
10716 SDLoc DL(Op);
10717 int64_t Cnt;
10718
10719 if (!Op.getOperand(1).getValueType().isVector())
10720 return Op;
10721 unsigned EltSize = VT.getScalarSizeInBits();
10722
10723 switch (Op.getOpcode()) {
10724 default:
10725 llvm_unreachable("unexpected shift opcode");
10726
10727 case ISD::SHL:
10728 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
10729 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
10730
10731 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
10732 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
10733 DAG.getConstant(Cnt, DL, MVT::i32));
10734 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
10735 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
10736 MVT::i32),
10737 Op.getOperand(0), Op.getOperand(1));
10738 case ISD::SRA:
10739 case ISD::SRL:
10740 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
10741 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
10742 : AArch64ISD::SRL_PRED;
10743 return LowerToPredicatedOp(Op, DAG, Opc);
10744 }
10745
10746 // Right shift immediate
10747 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
10748 unsigned Opc =
10749 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
10750 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
10751 DAG.getConstant(Cnt, DL, MVT::i32));
10752 }
10753
10754 // Right shift register. Note, there is not a shift right register
10755 // instruction, but the shift left register instruction takes a signed
10756 // value, where negative numbers specify a right shift.
10757 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
10758 : Intrinsic::aarch64_neon_ushl;
10759 // negate the shift amount
10760 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
10761 Op.getOperand(1));
10762 SDValue NegShiftLeft =
10763 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
10764 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
10765 NegShift);
10766 return NegShiftLeft;
10767 }
10768
10769 return SDValue();
10770}
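// Example (editor's sketch): "sra <4 x i32> %x, splat(3)" becomes
// (AArch64ISD::VASHR %x, #3), i.e. "sshr v0.4s, v0.4s, #3", while a
// non-constant right shift is lowered as sshl/ushl of the negated shift
// amount, since AArch64 has no vector right-shift-by-register instruction.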
10771
10772static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
10773 AArch64CC::CondCode CC, bool NoNans, EVT VT,
10774 const SDLoc &dl, SelectionDAG &DAG) {
10775 EVT SrcVT = LHS.getValueType();
10776 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
10777 "function only supposed to emit natural comparisons");
10778
10779 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
10780 APInt CnstBits(VT.getSizeInBits(), 0);
10781 APInt UndefBits(VT.getSizeInBits(), 0);
10782 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
10783 bool IsZero = IsCnst && (CnstBits == 0);
10784
10785 if (SrcVT.getVectorElementType().isFloatingPoint()) {
10786 switch (CC) {
10787 default:
10788 return SDValue();
10789 case AArch64CC::NE: {
10790 SDValue Fcmeq;
10791 if (IsZero)
10792 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10793 else
10794 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10795 return DAG.getNOT(dl, Fcmeq, VT);
10796 }
10797 case AArch64CC::EQ:
10798 if (IsZero)
10799 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10800 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10801 case AArch64CC::GE:
10802 if (IsZero)
10803 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
10804 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
10805 case AArch64CC::GT:
10806 if (IsZero)
10807 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
10808 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
10809 case AArch64CC::LS:
10810 if (IsZero)
10811 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
10812 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
10813 case AArch64CC::LT:
10814 if (!NoNans)
10815 return SDValue();
10816 // If we ignore NaNs then we can use to the MI implementation.
10817 LLVM_FALLTHROUGH;
10818 case AArch64CC::MI:
10819 if (IsZero)
10820 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
10821 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
10822 }
10823 }
10824
10825 switch (CC) {
10826 default:
10827 return SDValue();
10828 case AArch64CC::NE: {
10829 SDValue Cmeq;
10830 if (IsZero)
10831 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10832 else
10833 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10834 return DAG.getNOT(dl, Cmeq, VT);
10835 }
10836 case AArch64CC::EQ:
10837 if (IsZero)
10838 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10839 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10840 case AArch64CC::GE:
10841 if (IsZero)
10842 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
10843 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
10844 case AArch64CC::GT:
10845 if (IsZero)
10846 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
10847 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
10848 case AArch64CC::LE:
10849 if (IsZero)
10850 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
10851 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
10852 case AArch64CC::LS:
10853 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
10854 case AArch64CC::LO:
10855 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
10856 case AArch64CC::LT:
10857 if (IsZero)
10858 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
10859 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
10860 case AArch64CC::HI:
10861 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
10862 case AArch64CC::HS:
10863 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
10864 }
10865}
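// Example (a sketch): an integer SETNE compare against a zero splat is
// emitted as NOT(CMEQz LHS), since there is no direct "compare not equal"
// vector instruction; most other codes map onto CMxx/CMxxz nodes directly,
// sometimes with swapped operands.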
10866
10867SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
10868 SelectionDAG &DAG) const {
10869 if (Op.getValueType().isScalableVector())
10870 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
10871
10872 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10873 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
10874
10875 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
10876 SDValue LHS = Op.getOperand(0);
10877 SDValue RHS = Op.getOperand(1);
10878 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
10879 SDLoc dl(Op);
10880
10881 if (LHS.getValueType().getVectorElementType().isInteger()) {
10882 assert(LHS.getValueType() == RHS.getValueType());
10883 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10884 SDValue Cmp =
10885 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
10886 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10887 }
10888
10889 const bool FullFP16 =
10890 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
10891
10892 // Make v4f16 (only) fcmp operations utilise vector instructions
10893 // v8f16 support will be a litle more complicated
10894 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
10895 if (LHS.getValueType().getVectorNumElements() == 4) {
10896 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
10897 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
10898 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
10900 CmpVT = MVT::v4i32;
10901 } else
10902 return SDValue();
10903 }
10904
10905 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
10906 LHS.getValueType().getVectorElementType() != MVT::f128);
10907
10908 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10909 // clean. Some of them require two branches to implement.
10911 bool ShouldInvert;
10913
10915 SDValue Cmp =
10916 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
10917 if (!Cmp.getNode())
10918 return SDValue();
10919
10920 if (CC2 != AArch64CC::AL) {
10921 SDValue Cmp2 =
10922 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
10923 if (!Cmp2.getNode())
10924 return SDValue();
10925
10926 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
10927 }
10928
10929 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10930
10931 if (ShouldInvert)
10932 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
10933
10934 return Cmp;
10935}
10936
10937static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
10938 SelectionDAG &DAG) {
10939 SDValue VecOp = ScalarOp.getOperand(0);
10940 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
10941 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
10942 DAG.getConstant(0, DL, MVT::i64));
10943}
10944
10945SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
10946 SelectionDAG &DAG) const {
10947 SDValue Src = Op.getOperand(0);
10948
10949 // Try to lower fixed length reductions to SVE.
10950 EVT SrcVT = Src.getValueType();
10951 bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
10952 Op.getOpcode() == ISD::VECREDUCE_OR ||
10953 Op.getOpcode() == ISD::VECREDUCE_XOR ||
10954 Op.getOpcode() == ISD::VECREDUCE_FADD ||
10955 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
10956 SrcVT.getVectorElementType() == MVT::i64);
10957 if (SrcVT.isScalableVector() ||
10958 useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
10959
10960 if (SrcVT.getVectorElementType() == MVT::i1)
10961 return LowerPredReductionToSVE(Op, DAG);
10962
10963 switch (Op.getOpcode()) {
10964 case ISD::VECREDUCE_ADD:
10965 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
10966 case ISD::VECREDUCE_AND:
10967 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
10968 case ISD::VECREDUCE_OR:
10969 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
10970 case ISD::VECREDUCE_SMAX:
10971 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
10972 case ISD::VECREDUCE_SMIN:
10973 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
10974 case ISD::VECREDUCE_UMAX:
10975 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
10976 case ISD::VECREDUCE_UMIN:
10977 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
10978 case ISD::VECREDUCE_XOR:
10979 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
10980 case ISD::VECREDUCE_FADD:
10981 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
10982 case ISD::VECREDUCE_FMAX:
10983 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
10984 case ISD::VECREDUCE_FMIN:
10985 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
10986 default:
10987 llvm_unreachable("Unhandled fixed length reduction");
10988 }
10989 }
10990
10991 // Lower NEON reductions.
10992 SDLoc dl(Op);
10993 switch (Op.getOpcode()) {
10994 case ISD::VECREDUCE_ADD:
10995 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
10996 case ISD::VECREDUCE_SMAX:
10997 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
10998 case ISD::VECREDUCE_SMIN:
10999 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
11000 case ISD::VECREDUCE_UMAX:
11001 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
11002 case ISD::VECREDUCE_UMIN:
11003 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
11004 case ISD::VECREDUCE_FMAX: {
11005 return DAG.getNode(
11006 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
11007 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
11008 Src);
11009 }
11010 case ISD::VECREDUCE_FMIN: {
11011 return DAG.getNode(
11012 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
11013 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
11014 Src);
11015 }
11016 default:
11017 llvm_unreachable("Unhandled reduction");
11018 }
11019}
11020
11021SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
11022 SelectionDAG &DAG) const {
11023 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
11024 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
11025 return SDValue();
11026
11027 // LSE has an atomic load-add instruction, but not a load-sub.
11028 SDLoc dl(Op);
11029 MVT VT = Op.getSimpleValueType();
11030 SDValue RHS = Op.getOperand(2);
11031 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
11032 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
11033 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
11034 Op.getOperand(0), Op.getOperand(1), RHS,
11035 AN->getMemOperand());
11036}
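// Example (a sketch): with LSE, "atomicrmw sub i64* %p, i64 %x" is rewritten
// here as an ATOMIC_LOAD_ADD of (0 - %x), which can then select to the LDADD
// family instead of an LL/SC loop; the next function does the analogous
// AND-to-LDCLR rewrite with a complemented operand.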
11037
11038SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
11039 SelectionDAG &DAG) const {
11040 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
11041 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
11042 return SDValue();
11043
11044 // LSE has an atomic load-clear instruction, but not a load-and.
11045 SDLoc dl(Op);
11046 MVT VT = Op.getSimpleValueType();
11047 SDValue RHS = Op.getOperand(2);
11048 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
11049 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
11050 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
11051 Op.getOperand(0), Op.getOperand(1), RHS,
11052 AN->getMemOperand());
11053}
11054
11055SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
11056 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
11057 SDLoc dl(Op);
11058 EVT PtrVT = getPointerTy(DAG.getDataLayout());
11059 SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
11060
11061 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
11062 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
11063 if (Subtarget->hasCustomCallingConv())
11064 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
11065
11066 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
11067 DAG.getConstant(4, dl, MVT::i64));
11068 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
11069 Chain =
11071 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
11072 DAG.getRegisterMask(Mask), Chain.getValue(1));
11073 // To match the actual intent better, we should read the output from X15 here
11074 // again (instead of potentially spilling it to the stack), but rereading Size
11075 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
11076 // here.
11077
11078 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
11079 DAG.getConstant(4, dl, MVT::i64));
11080 return Chain;
11081}
11082
11083SDValue
11084AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
11085 SelectionDAG &DAG) const {
11086 assert(Subtarget->isTargetWindows() &&
11087 "Only Windows alloca probing supported");
11088 SDLoc dl(Op);
11089 // Get the inputs.
11090 SDNode *Node = Op.getNode();
11091 SDValue Chain = Op.getOperand(0);
11092 SDValue Size = Op.getOperand(1);
11093 MaybeAlign Align =
11094 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
11095 EVT VT = Node->getValueType(0);
11096
11097 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11098 "no-stack-arg-probe")) {
11099 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
11100 Chain = SP.getValue(1);
11101 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
11102 if (Align)
11103 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
11104 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
11105 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
11106 SDValue Ops[2] = {SP, Chain};
11107 return DAG.getMergeValues(Ops, dl);
11108 }
11109
11110 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
11111
11112 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
11113
11114 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
11115 Chain = SP.getValue(1);
11116 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
11117 if (Align)
11118 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
11119 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
11120 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
11121
11122 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
11123 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
11124
11125 SDValue Ops[2] = {SP, Chain};
11126 return DAG.getMergeValues(Ops, dl);
11127}
11128
11129SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
11130 SelectionDAG &DAG) const {
11131 EVT VT = Op.getValueType();
11132 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
11133
11134 SDLoc DL(Op);
11135 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
11136 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
11137 DL, VT);
11138}
11139
11140/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
11141template <unsigned NumVecs>
11142static bool
11143setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
11144 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
11145 Info.opc = ISD::INTRINSIC_VOID;
11146 // Retrieve EC from first vector argument.
11147 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
11149#ifndef NDEBUG
11150 // Check the assumption that all input vectors are the same type.
11151 for (unsigned I = 0; I < NumVecs; ++I)
11152 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
11153 "Invalid type.");
11154#endif
11155 // memVT is `NumVecs * VT`.
11156 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
11157 EC * NumVecs);
11158 Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
11159 Info.offset = 0;
11160 Info.align.reset();
11161 Info.flags = MachineMemOperand::MOStore;
11162 return true;
11163}
11164
11165/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
11166/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
11167/// specified in the intrinsic calls.
11168bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
11169 const CallInst &I,
11170 MachineFunction &MF,
11171 unsigned Intrinsic) const {
11172 auto &DL = I.getModule()->getDataLayout();
11173 switch (Intrinsic) {
11174 case Intrinsic::aarch64_sve_st2:
11175 return setInfoSVEStN<2>(*this, DL, Info, I);
11176 case Intrinsic::aarch64_sve_st3:
11177 return setInfoSVEStN<3>(*this, DL, Info, I);
11178 case Intrinsic::aarch64_sve_st4:
11179 return setInfoSVEStN<4>(*this, DL, Info, I);
11180 case Intrinsic::aarch64_neon_ld2:
11181 case Intrinsic::aarch64_neon_ld3:
11182 case Intrinsic::aarch64_neon_ld4:
11183 case Intrinsic::aarch64_neon_ld1x2:
11184 case Intrinsic::aarch64_neon_ld1x3:
11185 case Intrinsic::aarch64_neon_ld1x4:
11186 case Intrinsic::aarch64_neon_ld2lane:
11187 case Intrinsic::aarch64_neon_ld3lane:
11188 case Intrinsic::aarch64_neon_ld4lane:
11189 case Intrinsic::aarch64_neon_ld2r:
11190 case Intrinsic::aarch64_neon_ld3r:
11191 case Intrinsic::aarch64_neon_ld4r: {
11192 Info.opc = ISD::INTRINSIC_W_CHAIN;
11193 // Conservatively set memVT to the entire set of vectors loaded.
11194 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
11195 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
11196 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
11197 Info.offset = 0;
11198 Info.align.reset();
11199 // volatile loads with NEON intrinsics not supported
11200 Info.flags = MachineMemOperand::MOLoad;
11201 return true;
11202 }
11203 case Intrinsic::aarch64_neon_st2:
11204 case Intrinsic::aarch64_neon_st3:
11205 case Intrinsic::aarch64_neon_st4:
11206 case Intrinsic::aarch64_neon_st1x2:
11207 case Intrinsic::aarch64_neon_st1x3:
11208 case Intrinsic::aarch64_neon_st1x4:
11209 case Intrinsic::aarch64_neon_st2lane:
11210 case Intrinsic::aarch64_neon_st3lane:
11211 case Intrinsic::aarch64_neon_st4lane: {
11212 Info.opc = ISD::INTRINSIC_VOID;
11213 // Conservatively set memVT to the entire set of vectors stored.
11214 unsigned NumElts = 0;
11215 for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
11216 Type *ArgTy = I.getArgOperand(ArgI)->getType();
11217 if (!ArgTy->isVectorTy())
11218 break;
11219 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
11220 }
11221 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
11222 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
11223 Info.offset = 0;
11224 Info.align.reset();
11225 // volatile stores with NEON intrinsics not supported
11226 Info.flags = MachineMemOperand::MOStore;
11227 return true;
11228 }
11229 case Intrinsic::aarch64_ldaxr:
11230 case Intrinsic::aarch64_ldxr: {
11231 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
11232 Info.opc = ISD::INTRINSIC_W_CHAIN;
11233 Info.memVT = MVT::getVT(PtrTy->getElementType());
11234 Info.ptrVal = I.getArgOperand(0);
11235 Info.offset = 0;
11236 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11237 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
11238 return true;
11239 }
11240 case Intrinsic::aarch64_stlxr:
11241 case Intrinsic::aarch64_stxr: {
11242 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
11243 Info.opc = ISD::INTRINSIC_W_CHAIN;
11244 Info.memVT = MVT::getVT(PtrTy->getElementType());
11245 Info.ptrVal = I.getArgOperand(1);
11246 Info.offset = 0;
11247 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11248 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
11249 return true;
11250 }
11251 case Intrinsic::aarch64_ldaxp:
11252 case Intrinsic::aarch64_ldxp:
11253 Info.opc = ISD::INTRINSIC_W_CHAIN;
11254 Info.memVT = MVT::i128;
11255 Info.ptrVal = I.getArgOperand(0);
11256 Info.offset = 0;
11257 Info.align = Align(16);
11259 return true;
11260 case Intrinsic::aarch64_stlxp:
11261 case Intrinsic::aarch64_stxp:
11262 Info.opc = ISD::INTRINSIC_W_CHAIN;
11263 Info.memVT = MVT::i128;
11264 Info.ptrVal = I.getArgOperand(2);
11265 Info.offset = 0;
11266 Info.align = Align(16);
11268 return true;
11269 case Intrinsic::aarch64_sve_ldnt1: {
11270 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
11271 Info.opc = ISD::INTRINSIC_W_CHAIN;
11272 Info.memVT = MVT::getVT(I.getType());
11273 Info.ptrVal = I.getArgOperand(1);
11274 Info.offset = 0;
11275 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11276 Info.flags = MachineMemOperand::MOLoad;
11277 if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
11279 return true;
11280 }
11281 case Intrinsic::aarch64_sve_stnt1: {
11282 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
11283 Info.opc = ISD::INTRINSIC_W_CHAIN;
11284 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
11285 Info.ptrVal = I.getArgOperand(2);
11286 Info.offset = 0;
11287 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11288 Info.flags = MachineMemOperand::MOStore;
11289 if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
11291 return true;
11292 }
11293 default:
11294 break;
11295 }
11296
11297 return false;
11298}
11299
11300bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
11301 ISD::LoadExtType ExtTy,
11302 EVT NewVT) const {
11303 // TODO: This may be worth removing. Check regression tests for diffs.
11304 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
11305 return false;
11306
11307 // If we're reducing the load width in order to avoid having to use an extra
11308 // instruction to do extension then it's probably a good idea.
11309 if (ExtTy != ISD::NON_EXTLOAD)
11310 return true;
11311 // Don't reduce load width if it would prevent us from combining a shift into
11312 // the offset.
11313 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
11314 assert(Mem);
11315 const SDValue &Base = Mem->getBasePtr();
11316 if (Base.getOpcode() == ISD::ADD &&
11317 Base.getOperand(1).getOpcode() == ISD::SHL &&
11318 Base.getOperand(1).hasOneUse() &&
11319 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
11320 // The shift can be combined if it matches the size of the value being
11321 // loaded (and so reducing the width would make it not match).
11322 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
11323 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
11324 if (ShiftAmount == Log2_32(LoadBytes))
11325 return false;
11326 }
11327 // We have no reason to disallow reducing the load width, so allow it.
11328 return true;
11329}
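// Example (editor's sketch): for a 64-bit load from (add %base, (shl %idx, 3))
// the shift folds into the addressing mode ("ldr x0, [x1, x2, lsl #3]"), so
// narrowing the load to 32 bits is rejected here because the required scale
// would no longer match the shift amount.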
11330
11331// Truncations from 64-bit GPR to 32-bit GPR is free.
11332bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
11333 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11334 return false;
11335 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
11336 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
11337 return NumBits1 > NumBits2;
11338}
11339bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
11340 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
11341 return false;
11342 uint64_t NumBits1 = VT1.getFixedSizeInBits();
11343 uint64_t NumBits2 = VT2.getFixedSizeInBits();
11344 return NumBits1 > NumBits2;
11345}
11346
11347/// Check if it is profitable to hoist instruction in then/else to if.
11348/// Not profitable if I and it's user can form a FMA instruction
11349/// because we prefer FMSUB/FMADD.
11350bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
11351 if (I->getOpcode() != Instruction::FMul)
11352 return true;
11353
11354 if (!I->hasOneUse())
11355 return true;
11356
11357 Instruction *User = I->user_back();
11358
11359 if (User &&
11360 !(User->getOpcode() == Instruction::FSub ||
11361 User->getOpcode() == Instruction::FAdd))
11362 return true;
11363
11364 const TargetOptions &Options = getTargetMachine().Options;
11365 const Function *F = I->getFunction();
11366 const DataLayout &DL = F->getParent()->getDataLayout();
11367 Type *Ty = User->getOperand(0)->getType();
11368
11369 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
11370 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
11371 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
11372 Options.UnsafeFPMath));
11373}
11374
11375// All 32-bit GPR operations implicitly zero the high-half of the corresponding
11376// 64-bit GPR.
11377bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
11378 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11379 return false;
11380 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
11381 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
11382 return NumBits1 == 32 && NumBits2 == 64;
11383}
11384bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
11385 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
11386 return false;
11387 unsigned NumBits1 = VT1.getSizeInBits();
11388 unsigned NumBits2 = VT2.getSizeInBits();
11389 return NumBits1 == 32 && NumBits2 == 64;
11390}
11391
11392bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
11393 EVT VT1 = Val.getValueType();
11394 if (isZExtFree(VT1, VT2)) {
11395 return true;
11396 }
11397
11398 if (Val.getOpcode() != ISD::LOAD)
11399 return false;
11400
11401 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
11402 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
11403 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
11404 VT1.getSizeInBits() <= 32);
11405}
11406
11407bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
11408 if (isa<FPExtInst>(Ext))
11409 return false;
11410
11411 // Vector types are not free.
11412 if (Ext->getType()->isVectorTy())
11413 return false;
11414
11415 for (const Use &U : Ext->uses()) {
11416 // The extension is free if we can fold it with a left shift in an
11417 // addressing mode or an arithmetic operation: add, sub, and cmp.
11418
11419 // Is there a shift?
11420 const Instruction *Instr = cast<Instruction>(U.getUser());
11421
11422 // Is this a constant shift?
11423 switch (Instr->getOpcode()) {
11424 case Instruction::Shl:
11425 if (!isa<ConstantInt>(Instr->getOperand(1)))
11426 return false;
11427 break;
11430 auto &DL = Ext->getModule()->getDataLayout();
11431 std::advance(GTI, U.getOperandNo()-1);
11432 Type *IdxTy = GTI.getIndexedType();
11433 // This extension will end up with a shift because of the scaling factor.
11434 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
11435 // Get the shift amount based on the scaling factor:
11436 // log2(sizeof(IdxTy)) - log2(8).
11437 uint64_t ShiftAmt =
11438 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
11439 // Is the constant foldable in the shift of the addressing mode?
11440 // I.e., shift amount is between 1 and 4 inclusive.
11441 if (ShiftAmt == 0 || ShiftAmt > 4)
11442 return false;
11443 break;
11444 }
11445 case Instruction::Trunc:
11446 // Check if this is a noop.
11447 // trunc(sext ty1 to ty2) to ty1.
11448 if (Instr->getType() == Ext->getOperand(0)->getType())
11449 continue;
11451 default:
11452 return false;
11453 }
11454
11455 // At this point we can use the bfm family, so this extension is free
11456 // for that use.
11457 }
11458 return true;
11459}
11460
11461/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
11462/// or upper half of the vector elements.
11463static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
11464 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
11465 auto *FullTy = FullV->getType();
11466 auto *HalfTy = HalfV->getType();
11467 return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
11468 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
11469 };
11470
11471 auto extractHalf = [](Value *FullV, Value *HalfV) {
11472 auto *FullVT = cast<FixedVectorType>(FullV->getType());
11473 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
11474 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
11475 };
11476
11478 Value *S1Op1, *S2Op1;
11479 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
11481 return false;
11482
11483 // Check that the operands are half as wide as the result and we extract
11484 // half of the elements of the input vectors.
11485 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
11486 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
11487 return false;
11488
11489 // Check the mask extracts either the lower or upper half of vector
11490 // elements.
11491 int M1Start = -1;
11492 int M2Start = -1;
11493 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
11496 M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
11497 return false;
11498
11499 return true;
11500}
11501
11502/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
11503/// of the vector elements.
11504static bool areExtractExts(Value *Ext1, Value *Ext2) {
11505 auto areExtDoubled = [](Instruction *Ext) {
11506 return Ext->getType()->getScalarSizeInBits() ==
11507 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
11508 };
11509
11510 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
11511 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
11512 !areExtDoubled(cast<Instruction>(Ext1)) ||
11513 !areExtDoubled(cast<Instruction>(Ext2)))
11514 return false;
11515
11516 return true;
11517}
11518
11519/// Check if Op could be used with vmull_high_p64 intrinsic.
11520static bool isOperandOfVmullHighP64(Value *Op) {
11521 Value *VectorOperand = nullptr;
11522 ConstantInt *ElementIndex = nullptr;
11524 m_ConstantInt(ElementIndex))) &&
11525 ElementIndex->getValue() == 1 &&
11526 isa<FixedVectorType>(VectorOperand->getType()) &&
11527 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
11528}
11529
11530/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
11531static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
11532 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
11533}
11534
11535/// Check if sinking \p I's operands to I's basic block is profitable, because
11536/// the operands can be folded into a target instruction, e.g.
11537/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
11538bool AArch64TargetLowering::shouldSinkOperands(
11539 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
11540 if (!I->getType()->isVectorTy())
11541 return false;
11542
11543 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
11544 switch (II->getIntrinsicID()) {
11545 case Intrinsic::aarch64_neon_umull:
11546 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
11547 return false;
11548 Ops.push_back(&II->getOperandUse(0));
11549 Ops.push_back(&II->getOperandUse(1));
11550 return true;
11551
11552 case Intrinsic::aarch64_neon_pmull64:
11553 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
11554 II->getArgOperand(1)))
11555 return false;
11556 Ops.push_back(&II->getArgOperandUse(0));
11557 Ops.push_back(&II->getArgOperandUse(1));
11558 return true;
11559
11560 default:
11561 return false;
11562 }
11563 }
11564
11565 switch (I->getOpcode()) {
11566 case Instruction::Sub:
11567 case Instruction::Add: {
11568 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
11569 return false;
11570
11571 // If the exts' operands extract either the lower or upper elements, we
11572 // can sink them too.
11573 auto Ext1 = cast<Instruction>(I->getOperand(0));
11574 auto Ext2 = cast<Instruction>(I->getOperand(1));
11575    if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
11576      Ops.push_back(&Ext1->getOperandUse(0));
11577 Ops.push_back(&Ext2->getOperandUse(0));
11578 }
11579
11580 Ops.push_back(&I->getOperandUse(0));
11581 Ops.push_back(&I->getOperandUse(1));
11582
11583 return true;
11584 }
11585 case Instruction::Mul: {
11586 bool IsProfitable = false;
11587 for (auto &Op : I->operands()) {
11588 // Make sure we are not already sinking this operand
11589 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
11590 continue;
11591
11592      ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
11593      if (!Shuffle || !Shuffle->isZeroEltSplat())
11594 continue;
11595
11596 Value *ShuffleOperand = Shuffle->getOperand(0);
11597      InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
11598      if (!Insert)
11599 continue;
11600
11601 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
11602 if (!OperandInstr)
11603 continue;
11604
11605      ConstantInt *ElementConstant =
11606          dyn_cast<ConstantInt>(Insert->getOperand(2));
11607 // Check that the insertelement is inserting into element 0
11608 if (!ElementConstant || ElementConstant->getZExtValue() != 0)
11609 continue;
11610
11611 unsigned Opcode = OperandInstr->getOpcode();
11612 if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
11613 continue;
11614
11615 Ops.push_back(&Shuffle->getOperandUse(0));
11616 Ops.push_back(&Op);
11617 IsProfitable = true;
11618 }
11619
11620 return IsProfitable;
11621 }
11622 default:
11623 return false;
11624 }
11625 return false;
11626}
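// Sketch of the umull case handled above (hypothetical IR, for illustration):
// if the half-extracts feeding the intrinsic are defined in another block,
//   %lo.a = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, ..., i32 7>
//   %lo.b = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, ..., i32 7>
//   ...
//   %m = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lo.a, <8 x i8> %lo.b)
// their uses are reported as sinkable, so they can be moved next to the call
// and folded into a single umull/umull2 during instruction selection.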
11627
11628bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
11629                                          Align &RequiredAligment) const {
11630 if (!LoadedType.isSimple() ||
11631 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
11632 return false;
11633 // Cyclone supports unaligned accesses.
11634  RequiredAligment = Align(1);
11635  unsigned NumBits = LoadedType.getSizeInBits();
11636 return NumBits == 32 || NumBits == 64;
11637}
11638
11639/// A helper function for determining the number of interleaved accesses we
11640/// will generate when lowering accesses of the given type.
11641unsigned
11642AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
11643                                                 const DataLayout &DL) const {
11644 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
11645}
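// For example, a <16 x i32> vector is 512 bits, so (512 + 127) / 128 = 4
// interleaved accesses are needed for it, while a 64-bit <8 x i8> vector
// rounds up to a single access.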
11646
11647MachineMemOperand::Flags
11648AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
11649  if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
11650 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
11651 return MOStridedAccess;
11652  return MachineMemOperand::MONone;
11653}
11654
11655bool AArch64TargetLowering::isLegalInterleavedAccessType(
11656    VectorType *VecTy, const DataLayout &DL) const {
11657
11658 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
11659 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
11660
11661 // Ensure the number of vector elements is greater than 1.
11662 if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
11663 return false;
11664
11665 // Ensure the element type is legal.
11666 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
11667 return false;
11668
11669 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
11670 // 128 will be split into multiple interleaved accesses.
11671 return VecSize == 64 || VecSize % 128 == 0;
11672}
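// For instance, <8 x i8> (64 bits), <4 x i32> (128 bits) and <16 x i32>
// (512 bits, later split into four accesses) are accepted here, whereas
// <2 x i8> (16 bits) or <3 x i32> (96 bits) are not.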
11673
11674/// Lower an interleaved load into a ldN intrinsic.
11675///
11676/// E.g. Lower an interleaved load (Factor = 2):
11677/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
11678/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
11679/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
11680///
11681/// Into:
11682/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
11683/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
11684/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
11685bool AArch64TargetLowering::lowerInterleavedLoad(
11686    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
11687    ArrayRef<unsigned> Indices, unsigned Factor) const {
11688 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11689 "Invalid interleave factor");
11690 assert(!Shuffles.empty() && "Empty shufflevector input");
11691 assert(Shuffles.size() == Indices.size() &&
11692 "Unmatched number of shufflevectors and indices");
11693
11694 const DataLayout &DL = LI->getModule()->getDataLayout();
11695
11696 VectorType *VTy = Shuffles[0]->getType();
11697
11698 // Skip if we do not have NEON and skip illegal vector types. We can
11699 // "legalize" wide vector types into multiple interleaved accesses as long as
11700 // the vector types are divisible by 128.
11701 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
11702 return false;
11703
11704 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
11705
11706 auto *FVTy = cast<FixedVectorType>(VTy);
11707
11708 // A pointer vector can not be the return type of the ldN intrinsics. Need to
11709 // load integer vectors first and then convert to pointer vectors.
11710 Type *EltTy = FVTy->getElementType();
11711 if (EltTy->isPointerTy())
11712 FVTy =
11713 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
11714
11715 IRBuilder<> Builder(LI);
11716
11717 // The base address of the load.
11718 Value *BaseAddr = LI->getPointerOperand();
11719
11720 if (NumLoads > 1) {
11721 // If we're going to generate more than one load, reset the sub-vector type
11722 // to something legal.
11723 FVTy = FixedVectorType::get(FVTy->getElementType(),
11724 FVTy->getNumElements() / NumLoads);
11725
11726 // We will compute the pointer operand of each load from the original base
11727 // address using GEPs. Cast the base address to a pointer to the scalar
11728 // element type.
11729 BaseAddr = Builder.CreateBitCast(
11730 BaseAddr,
11731 FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
11732 }
11733
11734 Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
11735 Type *Tys[2] = {FVTy, PtrTy};
11736 static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
11737 Intrinsic::aarch64_neon_ld3,
11738 Intrinsic::aarch64_neon_ld4};
11739 Function *LdNFunc =
11740 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
11741
11742 // Holds sub-vectors extracted from the load intrinsic return values. The
11743 // sub-vectors are associated with the shufflevector instructions they will
11744 // replace.
11745  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
11746
11747 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
11748
11749 // If we're generating more than one load, compute the base address of
11750 // subsequent loads as an offset from the previous.
11751 if (LoadCount > 0)
11752 BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
11753 FVTy->getNumElements() * Factor);
11754
11755 CallInst *LdN = Builder.CreateCall(
11756 LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
11757
11758 // Extract and store the sub-vectors returned by the load intrinsic.
11759 for (unsigned i = 0; i < Shuffles.size(); i++) {
11760 ShuffleVectorInst *SVI = Shuffles[i];
11761 unsigned Index = Indices[i];
11762
11763 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
11764
11765 // Convert the integer vector to pointer vector if the element is pointer.
11766 if (EltTy->isPointerTy())
11767 SubVec = Builder.CreateIntToPtr(
11768            SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
11769                                         FVTy->getNumElements()));
11770 SubVecs[SVI].push_back(SubVec);
11771 }
11772 }
11773
11774 // Replace uses of the shufflevector instructions with the sub-vectors
11775 // returned by the load intrinsic. If a shufflevector instruction is
11776 // associated with more than one sub-vector, those sub-vectors will be
11777 // concatenated into a single wide vector.
11778 for (ShuffleVectorInst *SVI : Shuffles) {
11779 auto &SubVec = SubVecs[SVI];
11780 auto *WideVec =
11781 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
11782 SVI->replaceAllUsesWith(WideVec);
11783 }
11784
11785 return true;
11786}
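// Sketch of the multi-load path above (sizes chosen for illustration): for
// Factor = 2 with shufflevector results of type <8 x i32>, NumLoads is
// (256 + 127) / 128 = 2, so FVTy is reset to <4 x i32> and two ld2 calls are
// emitted, the second at an offset of 4 * 2 = 8 elements; the two sub-vectors
// collected for each shufflevector are then concatenated back to <8 x i32>.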
11787
11788/// Lower an interleaved store into a stN intrinsic.
11789///
11790/// E.g. Lower an interleaved store (Factor = 3):
11791/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
11792/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
11793/// store <12 x i32> %i.vec, <12 x i32>* %ptr
11794///
11795/// Into:
11796/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
11797/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
11798/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
11799/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
11800///
11801/// Note that the new shufflevectors will be removed and we'll only generate one
11802/// st3 instruction in CodeGen.
11803///
11804/// Example for a more general valid mask (Factor 3). Lower:
11805/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
11806/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
11807/// store <12 x i32> %i.vec, <12 x i32>* %ptr
11808///
11809/// Into:
11810/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
11811/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
11812/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
11813/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
11814bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
11815                                                  ShuffleVectorInst *SVI,
11816 unsigned Factor) const {
11817 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11818 "Invalid interleave factor");
11819
11820 auto *VecTy = cast<FixedVectorType>(SVI->getType());
11821 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
11822
11823 unsigned LaneLen = VecTy->getNumElements() / Factor;
11824 Type *EltTy = VecTy->getElementType();
11825 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
11826
11827 const DataLayout &DL = SI->getModule()->getDataLayout();
11828
11829 // Skip if we do not have NEON and skip illegal vector types. We can
11830 // "legalize" wide vector types into multiple interleaved accesses as long as
11831 // the vector types are divisible by 128.
11832 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
11833 return false;
11834
11835 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
11836
11837 Value *Op0 = SVI->getOperand(0);
11838 Value *Op1 = SVI->getOperand(1);
11839  IRBuilder<> Builder(SI);
11840
11841 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
11842 // vectors to integer vectors.
11843 if (EltTy->isPointerTy()) {
11844 Type *IntTy = DL.getIntPtrType(EltTy);
11845 unsigned NumOpElts =
11846 cast<FixedVectorType>(Op0->getType())->getNumElements();
11847
11848 // Convert to the corresponding integer vector.
11849 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
11850 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
11851 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
11852
11853    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
11854  }
11855
11856 // The base address of the store.
11857 Value *BaseAddr = SI->getPointerOperand();
11858
11859 if (NumStores > 1) {
11860 // If we're going to generate more than one store, reset the lane length
11861 // and sub-vector type to something legal.
11862 LaneLen /= NumStores;
11863 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
11864
11865 // We will compute the pointer operand of each store from the original base
11866 // address using GEPs. Cast the base address to a pointer to the scalar
11867 // element type.
11868 BaseAddr = Builder.CreateBitCast(
11869 BaseAddr,
11870 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
11871 }
11872
11873 auto Mask = SVI->getShuffleMask();
11874
11875 Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
11876 Type *Tys[2] = {SubVecTy, PtrTy};
11877 static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
11878 Intrinsic::aarch64_neon_st3,
11879 Intrinsic::aarch64_neon_st4};
11880 Function *StNFunc =
11881 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
11882
11883 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
11884
11885    SmallVector<Value *, 5> Ops;
11886
11887 // Split the shufflevector operands into sub vectors for the new stN call.
11888 for (unsigned i = 0; i < Factor; i++) {
11889 unsigned IdxI = StoreCount * LaneLen * Factor + i;
11890 if (Mask[IdxI] >= 0) {
11891 Ops.push_back(Builder.CreateShuffleVector(
11892 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
11893 } else {
11894 unsigned StartMask = 0;
11895 for (unsigned j = 1; j < LaneLen; j++) {
11896 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
11897 if (Mask[IdxJ * Factor + IdxI] >= 0) {
11898 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
11899 break;
11900 }
11901 }
11902 // Note: Filling undef gaps with random elements is ok, since
11903 // those elements were being written anyway (with undefs).
11904 // In the case of all undefs we're defaulting to using elems from 0
11905 // Note: StartMask cannot be negative, it's checked in
11906 // isReInterleaveMask
11907 Ops.push_back(Builder.CreateShuffleVector(
11908 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
11909 }
11910 }
11911
11912    // If we're generating more than one store, compute the base address of
11913    // subsequent stores as an offset from the previous one.
11914 if (StoreCount > 0)
11915 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
11916 BaseAddr, LaneLen * Factor);
11917
11918 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
11919 Builder.CreateCall(StNFunc, Ops);
11920 }
11921 return true;
11922}
11923
11924// Lower an SVE structured load intrinsic returning a tuple type to target
11925// specific intrinsic taking the same input but returning a multi-result value
11926// of the split tuple type.
11927//
11928// E.g. Lowering an LD3:
11929//
11930// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
11931// <vscale x 4 x i1> %pred,
11932// <vscale x 4 x i32>* %addr)
11933//
11934// Output DAG:
11935//
11936// t0: ch = EntryToken
11937// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
11938// t4: i64,ch = CopyFromReg t0, Register:i64 %1
11939// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
11940// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
11941//
11942// This is called pre-legalization to avoid widening/splitting issues with
11943// non-power-of-2 tuple types used for LD3, such as nxv12i32.
11944SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
11945                                                  ArrayRef<SDValue> LoadOps,
11946                                                  EVT VT, SelectionDAG &DAG,
11947 const SDLoc &DL) const {
11948 assert(VT.isScalableVector() && "Can only lower scalable vectors");
11949
11950 unsigned N, Opcode;
11951 static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
11952 {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
11953 {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
11954 {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
11955
11956 std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
11957  assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
11958         "invalid tuple vector type!");
11959
11960  EVT SplitVT =
11961      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
11962                       VT.getVectorElementCount().divideCoefficientBy(N));
11963  assert(isTypeLegal(SplitVT));
11964
11965  SmallVector<EVT, 5> VTs(N, SplitVT);
11966  VTs.push_back(MVT::Other); // Chain
11967 SDVTList NodeTys = DAG.getVTList(VTs);
11968
11969 SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
11970  SmallVector<SDValue, 4> PseudoLoadOps;
11971  for (unsigned I = 0; I < N; ++I)
11972 PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
11973 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
11974}
11975
11976EVT AArch64TargetLowering::getOptimalMemOpType(
11977    const MemOp &Op, const AttributeList &FuncAttributes) const {
11978 bool CanImplicitFloat =
11979 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
11980 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
11981 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
11982 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
11983 // taken one instruction to materialize the v2i64 zero and one store (with
11984 // restrictive addressing mode). Just do i64 stores.
11985 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
11986 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
11987 if (Op.isAligned(AlignCheck))
11988 return true;
11989 bool Fast;
11990 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
11991                                          MachineMemOperand::MONone, &Fast) &&
11992           Fast;
11993 };
11994
11995  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
11996      AlignmentIsAcceptable(MVT::v2i64, Align(16)))
11997    return MVT::v2i64;
11998  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
11999    return MVT::f128;
12000 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
12001 return MVT::i64;
12002 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
12003 return MVT::i32;
12004 return MVT::Other;
12005}
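// Illustrative outcomes of the selection above (assuming NEON and FP are
// usable and NoImplicitFloat is not set): a 64-byte, 16-byte-aligned memset
// picks MVT::v2i64; a 16-byte memset falls under the 32-byte small-memset
// cutoff and picks MVT::i64; a 4-byte memset picks MVT::i32.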
12006
12007LLT AArch64TargetLowering::getOptimalMemOpLLT(
12008    const MemOp &Op, const AttributeList &FuncAttributes) const {
12009 bool CanImplicitFloat =
12010 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
12011 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
12012 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
12013 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
12014 // taken one instruction to materialize the v2i64 zero and one store (with
12015 // restrictive addressing mode). Just do i64 stores.
12016 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
12017 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
12018 if (Op.isAligned(AlignCheck))
12019 return true;
12020 bool Fast;
12021 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
12022                                          MachineMemOperand::MONone, &Fast) &&
12023           Fast;
12024 };
12025
12026  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
12027      AlignmentIsAcceptable(MVT::v2i64, Align(16)))
12028    return LLT::fixed_vector(2, 64);
12029  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
12030    return LLT::scalar(128);
12031 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
12032 return LLT::scalar(64);
12033 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
12034 return LLT::scalar(32);
12035 return LLT();
12036}
12037
12038// 12-bit optionally shifted immediates are legal for adds.
12039bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
12040  if (Immed == std::numeric_limits<int64_t>::min()) {
12041 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
12042 << ": avoid UB for INT64_MIN\n");
12043 return false;
12044 }
12045 // Same encoding for add/sub, just flip the sign.
12046 Immed = std::abs(Immed);
12047 bool IsLegal = ((Immed >> 12) == 0 ||
12048 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
12049 LLVM_DEBUG(dbgs() << "Is " << Immed
12050 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
12051 return IsLegal;
12052}
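// Worked examples of the encoding rule above: 0xfff (a plain 12-bit value)
// and 0xabc000 (a 12-bit value shifted left by 12) are legal add/sub
// immediates, while 0x1001 is not, since it neither fits in 12 bits nor has
// its low 12 bits clear.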
12053
12054// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
12055// immediates is the same as for an add or a sub.
12056bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
12057  return isLegalAddImmediate(Immed);
12058}
12059
12060/// isLegalAddressingMode - Return true if the addressing mode represented
12061/// by AM is legal for this target, for a load/store of the specified type.
12062bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
12063                                                  const AddrMode &AM, Type *Ty,
12064 unsigned AS, Instruction *I) const {
12065 // AArch64 has five basic addressing modes:
12066 // reg
12067 // reg + 9-bit signed offset
12068 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
12069 // reg1 + reg2
12070 // reg + SIZE_IN_BYTES * reg
12071
12072 // No global is ever allowed as a base.
12073 if (AM.BaseGV)
12074 return false;
12075
12076 // No reg+reg+imm addressing.
12077 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
12078 return false;
12079
12080 // FIXME: Update this method to support scalable addressing modes.
12081 if (isa<ScalableVectorType>(Ty)) {
12082 uint64_t VecElemNumBytes =
12083 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
12084 return AM.HasBaseReg && !AM.BaseOffs &&
12085 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
12086 }
12087
12088 // check reg + imm case:
12089 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
12090 uint64_t NumBytes = 0;
12091 if (Ty->isSized()) {
12092 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
12093 NumBytes = NumBits / 8;
12094 if (!isPowerOf2_64(NumBits))
12095 NumBytes = 0;
12096 }
12097
12098 if (!AM.Scale) {
12099 int64_t Offset = AM.BaseOffs;
12100
12101 // 9-bit signed offset
12102 if (isInt<9>(Offset))
12103 return true;
12104
12105 // 12-bit unsigned offset
12106 unsigned shift = Log2_64(NumBytes);
12107 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
12108 // Must be a multiple of NumBytes (NumBytes is a power of 2)
12109 (Offset >> shift) << shift == Offset)
12110 return true;
12111 return false;
12112 }
12113
12114 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
12115
12116 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
12117}
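// Worked examples for an i64 (8-byte) access: [Xn, #255] is accepted via the
// signed 9-bit form, [Xn, #32760] via the scaled unsigned form (8 * 4095),
// and [Xn, #32768] is rejected; for the reg+reg forms the scale must be
// either 1 or the access size (8 here).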
12118
12119bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
12120  // Consider splitting large offset of struct or array.
12121 return true;
12122}
12123
12125 const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const {
12126 // Scaling factors are not free at all.
12127 // Operands | Rt Latency
12128 // -------------------------------------------
12129 // Rt, [Xn, Xm] | 4
12130 // -------------------------------------------
12131 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
12132 // Rt, [Xn, Wm, <extend> #imm] |
12133 if (isLegalAddressingMode(DL, AM, Ty, AS))
12134 // Scale represents reg2 * scale, thus account for 1 if
12135 // it is not equal to 0 or 1.
12136 return AM.Scale != 0 && AM.Scale != 1;
12137 return -1;
12138}
12139
12140bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
12141    const MachineFunction &MF, EVT VT) const {
12142 VT = VT.getScalarType();
12143
12144 if (!VT.isSimple())
12145 return false;
12146
12147 switch (VT.getSimpleVT().SimpleTy) {
12148 case MVT::f16:
12149 return Subtarget->hasFullFP16();
12150 case MVT::f32:
12151 case MVT::f64:
12152 return true;
12153 default:
12154 break;
12155 }
12156
12157 return false;
12158}
12159
12160bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
12161                                                       Type *Ty) const {
12162 switch (Ty->getScalarType()->getTypeID()) {
12163 case Type::FloatTyID:
12164 case Type::DoubleTyID:
12165 return true;
12166 default:
12167 return false;
12168 }
12169}
12170
12171bool AArch64TargetLowering::generateFMAsInMachineCombiner(
12172    EVT VT, CodeGenOpt::Level OptLevel) const {
12173 return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector();
12174}
12175
12176const MCPhysReg *
12177AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
12178  // LR is a callee-save register, but we must treat it as clobbered by any call
12179 // site. Hence we include LR in the scratch registers, which are in turn added
12180 // as implicit-defs for stackmaps and patchpoints.
12181 static const MCPhysReg ScratchRegs[] = {
12182 AArch64::X16, AArch64::X17, AArch64::LR, 0
12183 };
12184 return ScratchRegs;
12185}
12186
12187bool
12188AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
12189                                                     CombineLevel Level) const {
12190 N = N->getOperand(0).getNode();
12191 EVT VT = N->getValueType(0);
12192 // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
12193 // it with shift to let it be lowered to UBFX.
12194 if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
12195 isa<ConstantSDNode>(N->getOperand(1))) {
12196 uint64_t TruncMask = N->getConstantOperandVal(1);
12197 if (isMask_64(TruncMask) &&
12198 N->getOperand(0).getOpcode() == ISD::SRL &&
12199 isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
12200 return false;
12201 }
12202 return true;
12203}
12204
12205bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
12206                                                              Type *Ty) const {
12207 assert(Ty->isIntegerTy());
12208
12209 unsigned BitSize = Ty->getPrimitiveSizeInBits();
12210 if (BitSize == 0)
12211 return false;
12212
12213 int64_t Val = Imm.getSExtValue();
12214 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
12215 return true;
12216
12217 if ((int64_t)Val < 0)
12218 Val = ~Val;
12219 if (BitSize == 32)
12220 Val &= (1LL << 32) - 1;
12221
12222 unsigned LZ = countLeadingZeros((uint64_t)Val);
12223 unsigned Shift = (63 - LZ) / 16;
12224 // MOVZ is free so return true for one or fewer MOVK.
12225 return Shift < 3;
12226}
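// Worked examples: 0x12345678 has 35 leading zeros, so Shift is
// (63 - 35) / 16 = 1 and the constant (a MOVZ plus one MOVK) is materialized
// inline; a full 64-bit pattern such as 0x1234567812345678 gives Shift == 3
// (MOVZ plus three MOVKs) and is left as a load.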
12227
12228bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
12229                                                    unsigned Index) const {
12230  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
12231    return false;
12232
12233 return (Index == 0 || Index == ResVT.getVectorNumElements());
12234}
12235
12236/// Turn vector tests of the signbit in the form of:
12237/// xor (sra X, elt_size(X)-1), -1
12238/// into:
12239/// cmge X, X, #0
12240static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
12241                                         const AArch64Subtarget *Subtarget) {
12242 EVT VT = N->getValueType(0);
12243 if (!Subtarget->hasNEON() || !VT.isVector())
12244 return SDValue();
12245
12246 // There must be a shift right algebraic before the xor, and the xor must be a
12247 // 'not' operation.
12248 SDValue Shift = N->getOperand(0);
12249 SDValue Ones = N->getOperand(1);
12250 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
12251 !ISD::isBuildVectorAllOnes(Ones.getNode()))
12252 return SDValue();
12253
12254 // The shift should be smearing the sign bit across each vector element.
12255 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
12256  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
12257  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
12258 return SDValue();
12259
12260 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
12261}
12262
12263// Given a vecreduce_add node, detect the below pattern and convert it to the
12264// node sequence with UABDL, [S|U]ABD and UADDLP.
12265//
12266// i32 vecreduce_add(
12267// v16i32 abs(
12268// v16i32 sub(
12269// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
12270// =================>
12271// i32 vecreduce_add(
12272// v4i32 UADDLP(
12273// v8i16 add(
12274// v8i16 zext(
12275// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
12276// v8i16 zext(
12277// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
12278static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
12279                                                    SelectionDAG &DAG) {
12280 // Assumed i32 vecreduce_add
12281 if (N->getValueType(0) != MVT::i32)
12282 return SDValue();
12283
12284 SDValue VecReduceOp0 = N->getOperand(0);
12285 unsigned Opcode = VecReduceOp0.getOpcode();
12286 // Assumed v16i32 abs
12287 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
12288 return SDValue();
12289
12290 SDValue ABS = VecReduceOp0;
12291 // Assumed v16i32 sub
12292 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
12293 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
12294 return SDValue();
12295
12296 SDValue SUB = ABS->getOperand(0);
12297 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
12298 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
12299 // Assumed v16i32 type
12300 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
12301 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
12302 return SDValue();
12303
12304 // Assumed zext or sext
12305 bool IsZExt = false;
12306  if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
12307    IsZExt = true;
12308 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
12309 IsZExt = false;
12310 } else
12311 return SDValue();
12312
12313 SDValue EXT0 = SUB->getOperand(0);
12314 SDValue EXT1 = SUB->getOperand(1);
12315 // Assumed zext's operand has v16i8 type
12316 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
12317 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
12318 return SDValue();
12319
12320  // Pattern is detected. Let's convert it to a sequence of nodes.
12321 SDLoc DL(N);
12322
12323 // First, create the node pattern of UABD/SABD.
12325 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
12326 DAG.getConstant(8, DL, MVT::i64));
12328 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
12329 DAG.getConstant(8, DL, MVT::i64));
12333
12334 // Second, create the node pattern of UABAL.
12336 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
12337 DAG.getConstant(0, DL, MVT::i64));
12339 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
12340 DAG.getConstant(0, DL, MVT::i64));
12345
12346 // Third, create the node of UADDLP.
12348
12349 // Fourth, create the node of VECREDUCE_ADD.
12350 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
12351}
12352
12353// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
12354// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
12355// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
12356static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
12357                                          const AArch64Subtarget *ST) {
12358  if (!ST->hasDotProd())
12359    return SDValue();
12360
12361 SDValue Op0 = N->getOperand(0);
12362  if (N->getValueType(0) != MVT::i32 ||
12363      Op0.getValueType().getVectorElementType() != MVT::i32)
12364    return SDValue();
12365
12366 unsigned ExtOpcode = Op0.getOpcode();
12367 SDValue A = Op0;
12368 SDValue B;
12369 if (ExtOpcode == ISD::MUL) {
12370 A = Op0.getOperand(0);
12371 B = Op0.getOperand(1);
12372 if (A.getOpcode() != B.getOpcode() ||
12373 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
12374 return SDValue();
12375 ExtOpcode = A.getOpcode();
12376 }
12377  if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
12378    return SDValue();
12379
12380 EVT Op0VT = A.getOperand(0).getValueType();
12381 if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
12382 return SDValue();
12383
12384 SDLoc DL(Op0);
12385 // For non-mla reductions B can be set to 1. For MLA we take the operand of
12386 // the extend B.
12387 if (!B)
12388 B = DAG.getConstant(1, DL, Op0VT);
12389 else
12390 B = B.getOperand(0);
12391
12392  SDValue Zeros =
12393      DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
12394  auto DotOpcode =
12395      (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
12396 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
12397 A.getOperand(0), B);
12398 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
12399}
12400
12401static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
12402                                 TargetLowering::DAGCombinerInfo &DCI,
12403                                 const AArch64Subtarget *Subtarget) {
12404 if (DCI.isBeforeLegalizeOps())
12405 return SDValue();
12406
12407 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
12408}
12409
12410SDValue
12411AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
12412 SelectionDAG &DAG,
12413                                     SmallVectorImpl<SDNode *> &Created) const {
12414  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
12415  if (isIntDivCheap(N->getValueType(0), Attr))
12416 return SDValue(N,0); // Lower SDIV as SDIV
12417
12418 // fold (sdiv X, pow2)
12419 EVT VT = N->getValueType(0);
12420 if ((VT != MVT::i32 && VT != MVT::i64) ||
12421 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
12422 return SDValue();
12423
12424 SDLoc DL(N);
12425 SDValue N0 = N->getOperand(0);
12426 unsigned Lg2 = Divisor.countTrailingZeros();
12427 SDValue Zero = DAG.getConstant(0, DL, VT);
12428 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
12429
12430 // Add (N0 < 0) ? Pow2 - 1 : 0;
12431 SDValue CCVal;
12432 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
12433 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
12434 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
12435
12436 Created.push_back(Cmp.getNode());
12437 Created.push_back(Add.getNode());
12438 Created.push_back(CSel.getNode());
12439
12440 // Divide by pow2.
12441 SDValue SRA =
12442 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
12443
12444 // If we're dividing by a positive value, we're done. Otherwise, we must
12445 // negate the result.
12446 if (Divisor.isNonNegative())
12447 return SRA;
12448
12449 Created.push_back(SRA.getNode());
12450 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
12451}
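// Worked example of the sequence built above for (sdiv i32 %x, 8): Lg2 is 3,
// so the emitted nodes are roughly
//   cmp  = (subs %x, 0)            ; flags for %x < 0
//   add  = (add %x, 7)             ; Pow2 - 1
//   csel = (csel add, %x, lt, cmp)
//   res  = (sra csel, 3)
// and for a divisor of -8 the result is additionally negated (0 - res).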
12452
12453static bool IsSVECntIntrinsic(SDValue S) {
12454  switch(getIntrinsicID(S.getNode())) {
12455 default:
12456 break;
12457 case Intrinsic::aarch64_sve_cntb:
12458 case Intrinsic::aarch64_sve_cnth:
12459 case Intrinsic::aarch64_sve_cntw:
12460 case Intrinsic::aarch64_sve_cntd:
12461 return true;
12462 }
12463 return false;
12464}
12465
12466/// Calculates what the pre-extend type is, based on the extension
12467/// operation node provided by \p Extend.
12468///
12469/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
12470/// pre-extend type is pulled directly from the operand, while other extend
12471/// operations need a bit more inspection to get this information.
12472///
12473/// \param Extend The SDNode from the DAG that represents the extend operation
12474/// \param DAG The SelectionDAG hosting the \p Extend node
12475///
12476/// \returns The type representing the \p Extend source type, or \p MVT::Other
12477/// if no valid type can be determined
12478static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) {
12479  switch (Extend.getOpcode()) {
12480 case ISD::SIGN_EXTEND:
12481 case ISD::ZERO_EXTEND:
12482 return Extend.getOperand(0).getValueType();
12483 case ISD::AssertSext:
12484 case ISD::AssertZext:
12485  case ISD::SIGN_EXTEND_INREG: {
12486    VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
12487 if (!TypeNode)
12488 return MVT::Other;
12489 return TypeNode->getVT();
12490 }
12491 case ISD::AND: {
12492    ConstantSDNode *Constant =
12493        dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
12494 if (!Constant)
12495 return MVT::Other;
12496
12497 uint32_t Mask = Constant->getZExtValue();
12498
12499 if (Mask == UCHAR_MAX)
12500 return MVT::i8;
12501 else if (Mask == USHRT_MAX)
12502 return MVT::i16;
12503 else if (Mask == UINT_MAX)
12504 return MVT::i32;
12505
12506 return MVT::Other;
12507 }
12508 default:
12509 return MVT::Other;
12510 }
12511
12512 llvm_unreachable("Code path unhandled in calculatePreExtendType!");
12513}
12514
12515/// Combines a dup(sext/zext) node pattern into sext/zext(dup)
12516/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
12517static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle,
12518                                                SelectionDAG &DAG) {
12519
12520  ShuffleVectorSDNode *ShuffleNode =
12521      dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode());
12522  if (!ShuffleNode)
12523 return SDValue();
12524
12525 // Ensuring the mask is zero before continuing
12526 if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0)
12527 return SDValue();
12528
12529 SDValue InsertVectorElt = VectorShuffle.getOperand(0);
12530
12531 if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
12532 return SDValue();
12533
12534 SDValue InsertLane = InsertVectorElt.getOperand(2);
12535  ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode());
12536  // Ensures the insert is inserting into lane 0
12537 if (!Constant || Constant->getZExtValue() != 0)
12538 return SDValue();
12539
12540 SDValue Extend = InsertVectorElt.getOperand(1);
12541 unsigned ExtendOpcode = Extend.getOpcode();
12542
12543 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
12544 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
12545 ExtendOpcode == ISD::AssertSext;
12546 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
12547 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
12548 return SDValue();
12549
12550 EVT TargetType = VectorShuffle.getValueType();
12552  EVT PreExtendType = calculatePreExtendType(Extend, DAG);
12553 if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 &&
12554       TargetType != MVT::v2i64) ||
12555      (PreExtendType == MVT::Other))
12556    return SDValue();
12557
12558  // Restrict valid pre-extend data type
12559  if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 &&
12560      PreExtendType != MVT::i32)
12561    return SDValue();
12562
12564
12565 if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount())
12566 return SDValue();
12567
12568 if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2)
12569 return SDValue();
12570
12572
12575 DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType),
12576 DAG.getConstant(0, DL, MVT::i64));
12577
12578 std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue());
12579
12582 DAG.getUNDEF(PreExtendVT), ShuffleMask);
12583
12585 DL, TargetType, VectorShuffleNode);
12586
12587 return ExtendNode;
12588}
12589
12590/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
12591/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
12593 // If the value type isn't a vector, none of the operands are going to be dups
12594 if (!Mul->getValueType(0).isVector())
12595 return SDValue();
12596
12597 SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG);
12598 SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG);
12599
12600 // Neither operands have been changed, don't make any further changes
12601 if (!Op0 && !Op1)
12602 return SDValue();
12603
12604 SDLoc DL(Mul);
12605 return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0),
12606 Op0 ? Op0 : Mul->getOperand(0),
12607 Op1 ? Op1 : Mul->getOperand(1));
12608}
12609
12610static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
12611                                 TargetLowering::DAGCombinerInfo &DCI,
12612                                 const AArch64Subtarget *Subtarget) {
12613
12614 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
12615 return Ext;
12616
12617 if (DCI.isBeforeLegalizeOps())
12618 return SDValue();
12619
12620 // The below optimizations require a constant RHS.
12621 if (!isa<ConstantSDNode>(N->getOperand(1)))
12622 return SDValue();
12623
12624 SDValue N0 = N->getOperand(0);
12625 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
12626 const APInt &ConstValue = C->getAPIntValue();
12627
12628 // Allow the scaling to be folded into the `cnt` instruction by preventing
12629 // the scaling to be obscured here. This makes it easier to pattern match.
12630 if (IsSVECntIntrinsic(N0) ||
12631 (N0->getOpcode() == ISD::TRUNCATE &&
12632 (IsSVECntIntrinsic(N0->getOperand(0)))))
12633 if (ConstValue.sge(1) && ConstValue.sle(16))
12634 return SDValue();
12635
12636 // Multiplication of a power of two plus/minus one can be done more
12637  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
12638 // future CPUs have a cheaper MADD instruction, this may need to be
12639 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
12640 // 64-bit is 5 cycles, so this is always a win.
12641 // More aggressively, some multiplications N0 * C can be lowered to
12642 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
12643 // e.g. 6=3*2=(2+1)*2.
12644 // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
12645 // which equals to (1+2)*16-(1+2).
12646
12647 // TrailingZeroes is used to test if the mul can be lowered to
12648 // shift+add+shift.
12649 unsigned TrailingZeroes = ConstValue.countTrailingZeros();
12650 if (TrailingZeroes) {
12651 // Conservatively do not lower to shift+add+shift if the mul might be
12652 // folded into smul or umul.
12653 if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
12654 isZeroExtended(N0.getNode(), DAG)))
12655 return SDValue();
12656 // Conservatively do not lower to shift+add+shift if the mul might be
12657 // folded into madd or msub.
12658 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
12659 N->use_begin()->getOpcode() == ISD::SUB))
12660 return SDValue();
12661 }
12662 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
12663  // and shift+add+shift.
12664  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
12665
12666 unsigned ShiftAmt, AddSubOpc;
12667 // Is the shifted value the LHS operand of the add/sub?
12668 bool ShiftValUseIsN0 = true;
12669 // Do we need to negate the result?
12670 bool NegateResult = false;
12671
12672 if (ConstValue.isNonNegative()) {
12673 // (mul x, 2^N + 1) => (add (shl x, N), x)
12674 // (mul x, 2^N - 1) => (sub (shl x, N), x)
12675 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
12676    APInt SCVMinus1 = ShiftedConstValue - 1;
12677    APInt CVPlus1 = ConstValue + 1;
12678 if (SCVMinus1.isPowerOf2()) {
12679 ShiftAmt = SCVMinus1.logBase2();
12680 AddSubOpc = ISD::ADD;
12681 } else if (CVPlus1.isPowerOf2()) {
12682 ShiftAmt = CVPlus1.logBase2();
12683 AddSubOpc = ISD::SUB;
12684 } else
12685 return SDValue();
12686 } else {
12687 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
12688 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
12689    APInt CVNegPlus1 = -ConstValue + 1;
12690    APInt CVNegMinus1 = -ConstValue - 1;
12691    if (CVNegPlus1.isPowerOf2()) {
12692 ShiftAmt = CVNegPlus1.logBase2();
12693 AddSubOpc = ISD::SUB;
12694 ShiftValUseIsN0 = false;
12695 } else if (CVNegMinus1.isPowerOf2()) {
12696 ShiftAmt = CVNegMinus1.logBase2();
12697 AddSubOpc = ISD::ADD;
12698 NegateResult = true;
12699 } else
12700 return SDValue();
12701 }
12702
12703 SDLoc DL(N);
12704 EVT VT = N->getValueType(0);
12705 SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
12706 DAG.getConstant(ShiftAmt, DL, MVT::i64));
12707
12708  SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
12709  SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
12710  SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
12711  assert(!(NegateResult && TrailingZeroes) &&
12712         "NegateResult and TrailingZeroes cannot both be true for now.");
12713 // Negate the result.
12714 if (NegateResult)
12715 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
12716 // Shift the result.
12717 if (TrailingZeroes)
12718    return DAG.getNode(ISD::SHL, DL, VT, Res,
12719                       DAG.getConstant(TrailingZeroes, DL, MVT::i64));
12720 return Res;
12721}
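// Worked example of the shift+add+shift path above: for (mul x, 6) the
// constant is (2 + 1) * 2, so TrailingZeroes is 1, the shifted constant is 3,
// and the result is (shl (add (shl x, 1), x), 1); for (mul x, -3) the
// negative branch produces (sub x, (shl x, 2)).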
12722
12723static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
12724                                                          SelectionDAG &DAG) {
12725 // Take advantage of vector comparisons producing 0 or -1 in each lane to
12726 // optimize away operation when it's from a constant.
12727 //
12728 // The general transformation is:
12729 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
12730 // AND(VECTOR_CMP(x,y), constant2)
12731 // constant2 = UNARYOP(constant)
12732
12733 // Early exit if this isn't a vector operation, the operand of the
12734 // unary operation isn't a bitwise AND, or if the sizes of the operations
12735 // aren't the same.
12736 EVT VT = N->getValueType(0);
12737 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
12738 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
12739 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
12740 return SDValue();
12741
12742 // Now check that the other operand of the AND is a constant. We could
12743 // make the transformation for non-constant splats as well, but it's unclear
12744 // that would be a benefit as it would not eliminate any operations, just
12745 // perform one more step in scalar code before moving to the vector unit.
12746 if (BuildVectorSDNode *BV =
12747 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
12748 // Bail out if the vector isn't a constant.
12749 if (!BV->isConstant())
12750 return SDValue();
12751
12752 // Everything checks out. Build up the new and improved node.
12753 SDLoc DL(N);
12754 EVT IntVT = BV->getValueType(0);
12755 // Create a new constant of the appropriate type for the transformed
12756 // DAG.
12757 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
12758    // The AND node needs bitcasts to/from an integer vector type around it.
12759    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
12760    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
12761 N->getOperand(0)->getOperand(0), MaskConst);
12762 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
12763 return Res;
12764 }
12765
12766 return SDValue();
12767}
12768
12769static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
12770                                     const AArch64Subtarget *Subtarget) {
12771 // First try to optimize away the conversion when it's conditionally from
12772  // a constant. Vectors only.
12773  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
12774    return Res;
12775
12776 EVT VT = N->getValueType(0);
12777 if (VT != MVT::f32 && VT != MVT::f64)
12778 return SDValue();
12779
12780 // Only optimize when the source and destination types have the same width.
12781 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
12782 return SDValue();
12783
12784 // If the result of an integer load is only used by an integer-to-float
12785  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
12786 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
12787 SDValue N0 = N->getOperand(0);
12788 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
12789 // Do not change the width of a volatile load.
12790 !cast<LoadSDNode>(N0)->isVolatile()) {
12791    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12792    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
12793 LN0->getPointerInfo(), LN0->getAlignment(),
12794 LN0->getMemOperand()->getFlags());
12795
12796 // Make sure successors of the original load stay after it by updating them
12797 // to use the new Chain.
12798 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
12799
12800 unsigned Opcode =
12802 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
12803 }
12804
12805 return SDValue();
12806}
12807
12808/// Fold a floating-point multiply by power of two into floating-point to
12809/// fixed-point conversion.
12810static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
12811                                     TargetLowering::DAGCombinerInfo &DCI,
12812                                     const AArch64Subtarget *Subtarget) {
12813 if (!Subtarget->hasNEON())
12814 return SDValue();
12815
12816 if (!N->getValueType(0).isSimple())
12817 return SDValue();
12818
12819 SDValue Op = N->getOperand(0);
12820 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12821 Op.getOpcode() != ISD::FMUL)
12822 return SDValue();
12823
12824 SDValue ConstVec = Op->getOperand(1);
12825  if (!isa<BuildVectorSDNode>(ConstVec))
12826    return SDValue();
12827
12828 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
12829 uint32_t FloatBits = FloatTy.getSizeInBits();
12830 if (FloatBits != 32 && FloatBits != 64)
12831 return SDValue();
12832
12833 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
12834 uint32_t IntBits = IntTy.getSizeInBits();
12835 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12836 return SDValue();
12837
12838 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
12839 if (IntBits > FloatBits)
12840 return SDValue();
12841
12844 int32_t Bits = IntBits == 64 ? 64 : 32;
12845 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
12846 if (C == -1 || C == 0 || C > Bits)
12847 return SDValue();
12848
12849 MVT ResTy;
12850 unsigned NumLanes = Op.getValueType().getVectorNumElements();
12851 switch (NumLanes) {
12852 default:
12853 return SDValue();
12854 case 2:
12856 break;
12857 case 4:
12859 break;
12860 }
12861
12862 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12863 return SDValue();
12864
12865 assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
12866 "Illegal vector type after legalization");
12867
12868 SDLoc DL(N);
12869 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
12870 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
12871 : Intrinsic::aarch64_neon_vcvtfp2fxu;
12875 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
12876 // We can handle smaller integers by generating an extra trunc.
12877 if (IntBits < FloatBits)
12878 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
12879
12880 return FixConv;
12881}
12882
12883/// Fold a floating-point divide by power of two into fixed-point to
12884/// floating-point conversion.
12885static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
12886                                  TargetLowering::DAGCombinerInfo &DCI,
12887                                  const AArch64Subtarget *Subtarget) {
12888 if (!Subtarget->hasNEON())
12889 return SDValue();
12890
12891 SDValue Op = N->getOperand(0);
12892 unsigned Opc = Op->getOpcode();
12893 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12894 !Op.getOperand(0).getValueType().isSimple() ||
12895 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
12896 return SDValue();
12897
12898 SDValue ConstVec = N->getOperand(1);
12899  if (!isa<BuildVectorSDNode>(ConstVec))
12900    return SDValue();
12901
12902 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
12903 int32_t IntBits = IntTy.getSizeInBits();
12904 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12905 return SDValue();
12906
12907 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
12908 int32_t FloatBits = FloatTy.getSizeInBits();
12909 if (FloatBits != 32 && FloatBits != 64)
12910 return SDValue();
12911
12912 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
12913 if (IntBits > FloatBits)
12914 return SDValue();
12915
12918 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
12919 if (C == -1 || C == 0 || C > FloatBits)
12920 return SDValue();
12921
12922 MVT ResTy;
12923 unsigned NumLanes = Op.getValueType().getVectorNumElements();
12924 switch (NumLanes) {
12925 default:
12926 return SDValue();
12927 case 2:
12929 break;
12930 case 4:
12932 break;
12933 }
12934
12935 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12936 return SDValue();
12937
12938 SDLoc DL(N);
12939 SDValue ConvInput = Op.getOperand(0);
12940 bool IsSigned = Opc == ISD::SINT_TO_FP;
12941 if (IntBits < FloatBits)
12943 ResTy, ConvInput);
12944
12945 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
12946 : Intrinsic::aarch64_neon_vcvtfxu2fp;
12947 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
12949 DAG.getConstant(C, DL, MVT::i32));
12950}
12951
12952/// An EXTR instruction is made up of two shifts, ORed together. This helper
12953/// searches for and classifies those shifts.
12954static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
12955 bool &FromHi) {
12956 if (N.getOpcode() == ISD::SHL)
12957 FromHi = false;
12958 else if (N.getOpcode() == ISD::SRL)
12959 FromHi = true;
12960 else
12961 return false;
12962
12963 if (!isa<ConstantSDNode>(N.getOperand(1)))
12964 return false;
12965
12966 ShiftAmount = N->getConstantOperandVal(1);
12967 Src = N->getOperand(0);
12968 return true;
12969}
12970
12971/// EXTR instruction extracts a contiguous chunk of bits from two existing
12972/// registers viewed as a high/low pair. This function looks for the pattern:
12973/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
12974/// with an EXTR. Can't quite be done in TableGen because the two immediates
12975/// aren't independent.
12976static SDValue tryCombineToEXTR(SDNode *N,
12977                                TargetLowering::DAGCombinerInfo &DCI) {
12978  SelectionDAG &DAG = DCI.DAG;
12979 SDLoc DL(N);
12980 EVT VT = N->getValueType(0);
12981
12982 assert(N->getOpcode() == ISD::OR && "Unexpected root");
12983
12984 if (VT != MVT::i32 && VT != MVT::i64)
12985 return SDValue();
12986
12987 SDValue LHS;
12988 uint32_t ShiftLHS = 0;
12989 bool LHSFromHi = false;
12990 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
12991 return SDValue();
12992
12993 SDValue RHS;
12994 uint32_t ShiftRHS = 0;
12995 bool RHSFromHi = false;
12996 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
12997 return SDValue();
12998
12999 // If they're both trying to come from the high part of the register, they're
13000 // not really an EXTR.
13001 if (LHSFromHi == RHSFromHi)
13002 return SDValue();
13003
13004 if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
13005 return SDValue();
13006
13007 if (LHSFromHi) {
13008    std::swap(LHS, RHS);
13009    std::swap(ShiftLHS, ShiftRHS);
13010 }
13011
13012  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
13013                     DAG.getConstant(ShiftRHS, DL, MVT::i64));
13014}
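// Worked example: for i32, (or (shl %a, 24), (srl %b, 8)) has
// ShiftLHS + ShiftRHS == 32, so it is rewritten to EXTR %a, %b, #8, i.e. the
// 32-bit window taken 8 bits into the %a:%b register pair.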
13015
13016static SDValue tryCombineToBSL(SDNode *N,
13017                               TargetLowering::DAGCombinerInfo &DCI) {
13018  EVT VT = N->getValueType(0);
13019 SelectionDAG &DAG = DCI.DAG;
13020 SDLoc DL(N);
13021
13022 if (!VT.isVector())
13023 return SDValue();
13024
13025 // The combining code currently only works for NEON vectors. In particular,
13026 // it does not work for SVE when dealing with vectors wider than 128 bits.
13027 if (!VT.is64BitVector() && !VT.is128BitVector())
13028 return SDValue();
13029
13030 SDValue N0 = N->getOperand(0);
13031 if (N0.getOpcode() != ISD::AND)
13032 return SDValue();
13033
13034 SDValue N1 = N->getOperand(1);
13035 if (N1.getOpcode() != ISD::AND)
13036 return SDValue();
13037
13038 // InstCombine does (not (neg a)) => (add a -1).
13039 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
13040 // Loop over all combinations of AND operands.
13041 for (int i = 1; i >= 0; --i) {
13042 for (int j = 1; j >= 0; --j) {
13043 SDValue O0 = N0->getOperand(i);
13044      SDValue O1 = N1->getOperand(j);
13045      SDValue Sub, Add, SubSibling, AddSibling;
13046
13047 // Find a SUB and an ADD operand, one from each AND.
13048 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
13049 Sub = O0;
13050 Add = O1;
13051 SubSibling = N0->getOperand(1 - i);
13052 AddSibling = N1->getOperand(1 - j);
13053 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
13054 Add = O0;
13055 Sub = O1;
13056 AddSibling = N0->getOperand(1 - i);
13057 SubSibling = N1->getOperand(1 - j);
13058 } else
13059 continue;
13060
13061      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
13062        continue;
13063
13064      // The all-ones constant is always the right-hand operand of the Add.
13065 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
13066 continue;
13067
13068 if (Sub.getOperand(1) != Add.getOperand(0))
13069 continue;
13070
13071 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
13072 }
13073 }
13074
13075 // (or (and a b) (and (not a) c)) => (bsl a b c)
13076 // We only have to look for constant vectors here since the general, variable
13077 // case can be handled in TableGen.
13078 unsigned Bits = VT.getScalarSizeInBits();
13079 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
13080 for (int i = 1; i >= 0; --i)
13081 for (int j = 1; j >= 0; --j) {
13082      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
13083      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
13084      if (!BVN0 || !BVN1)
13085 continue;
13086
13087 bool FoundMatch = true;
13088 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
13089        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
13090        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
13091        if (!CN0 || !CN1 ||
13092 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
13093 FoundMatch = false;
13094 break;
13095 }
13096 }
13097
13098 if (FoundMatch)
13099 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
13100 N0->getOperand(1 - i), N1->getOperand(1 - j));
13101 }
13102
13103 return SDValue();
13104}
13105
13106static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
13107                                const AArch64Subtarget *Subtarget) {
13108 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
13109 SelectionDAG &DAG = DCI.DAG;
13110 EVT VT = N->getValueType(0);
13111
13112 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
13113 return SDValue();
13114
13115 if (SDValue Res = tryCombineToEXTR(N, DCI))
13116 return Res;
13117
13118 if (SDValue Res = tryCombineToBSL(N, DCI))
13119 return Res;
13120
13121 return SDValue();
13122}
13123
13124static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
13125  if (!MemVT.getVectorElementType().isSimple())
13126 return false;
13127
13128 uint64_t MaskForTy = 0ull;
13129 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
13130 case MVT::i8:
13131 MaskForTy = 0xffull;
13132 break;
13133 case MVT::i16:
13134 MaskForTy = 0xffffull;
13135 break;
13136 case MVT::i32:
13137 MaskForTy = 0xffffffffull;
13138 break;
13139 default:
13140 return false;
13141 break;
13142 }
13143
13144 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
13145 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
13146 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
13147
13148 return false;
13149}
13150
13151static SDValue performSVEAndCombine(SDNode *N,
13152                                    TargetLowering::DAGCombinerInfo &DCI) {
13153  if (DCI.isBeforeLegalizeOps())
13154 return SDValue();
13155
13156 SelectionDAG &DAG = DCI.DAG;
13157 SDValue Src = N->getOperand(0);
13158 unsigned Opc = Src->getOpcode();
13159
13160 // Zero/any extend of an unsigned unpack
13161 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
13162 SDValue UnpkOp = Src->getOperand(0);
13163 SDValue Dup = N->getOperand(1);
13164
13165 if (Dup.getOpcode() != AArch64ISD::DUP)
13166 return SDValue();
13167
13168 SDLoc DL(N);
13169 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
13170 uint64_t ExtVal = C->getZExtValue();
13171
13172 // If the mask is fully covered by the unpack, we don't need to push
13173 // a new AND onto the operand
13174 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
13175 if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
13176 (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
13177 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
13178 return Src;
13179
13180 // Truncate to prevent a DUP with an over wide constant
13181 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
13182
13183 // Otherwise, make sure we propagate the AND to the operand
13184 // of the unpack
13186 UnpkOp->getValueType(0),
13187 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
13188
13189 SDValue And = DAG.getNode(ISD::AND, DL,
13190 UnpkOp->getValueType(0), UnpkOp, Dup);
13191
13192 return DAG.getNode(Opc, DL, N->getValueType(0), And);
13193 }
13194
13195  if (!EnableCombineMGatherIntrinsics)
13196    return SDValue();
13197
13198 SDValue Mask = N->getOperand(1);
13199
13200 if (!Src.hasOneUse())
13201 return SDValue();
13202
13203 EVT MemVT;
13204
13205 // SVE load instructions perform an implicit zero-extend, which makes them
13206 // perfect candidates for combining.
13207 switch (Opc) {
13211 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
13212 break;
13228 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
13229 break;
13230 default:
13231 return SDValue();
13232 }
13233
13234 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
13235 return Src;
13236
13237 return SDValue();
13238}
13239
13240static SDValue performANDCombine(SDNode *N,
13241                                 TargetLowering::DAGCombinerInfo &DCI) {
13242  SelectionDAG &DAG = DCI.DAG;
13243 SDValue LHS = N->getOperand(0);
13244 EVT VT = N->getValueType(0);
13245 if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
13246 return SDValue();
13247
13248 if (VT.isScalableVector())
13249 return performSVEAndCombine(N, DCI);
13250
13251 // The combining code below works only for NEON vectors. In particular, it
13252 // does not work for SVE when dealing with vectors wider than 128 bits.
13253 if (!(VT.is64BitVector() || VT.is128BitVector()))
13254 return SDValue();
13255
13256  BuildVectorSDNode *BVN =
13257      dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
13258 if (!BVN)
13259 return SDValue();
13260
13261 // AND does not accept an immediate, so check if we can use a BIC immediate
13262 // instruction instead. We do this here instead of using a (and x, (mvni imm))
13263 // pattern in isel, because some immediates may be lowered to the preferred
13264 // (and x, (movi imm)) form, even though an mvni representation also exists.
13265 APInt DefBits(VT.getSizeInBits(), 0);
13266 APInt UndefBits(VT.getSizeInBits(), 0);
13268 SDValue NewOp;
13269
13270 DefBits = ~DefBits;
13272 DefBits, &LHS)) ||
13274 DefBits, &LHS)))
13275 return NewOp;
13276
13279 UndefBits, &LHS)) ||
13281 UndefBits, &LHS)))
13282 return NewOp;
13283 }
13284
13285 return SDValue();
13286}
13287
13288static SDValue performSRLCombine(SDNode *N,
13289                                 TargetLowering::DAGCombinerInfo &DCI) {
13290  SelectionDAG &DAG = DCI.DAG;
13291 EVT VT = N->getValueType(0);
13292 if (VT != MVT::i32 && VT != MVT::i64)
13293 return SDValue();
13294
13295 // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
13296 // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
13297 // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
13298 SDValue N0 = N->getOperand(0);
13299 if (N0.getOpcode() == ISD::BSWAP) {
13300 SDLoc DL(N);
13301 SDValue N1 = N->getOperand(1);
13302 SDValue N00 = N0.getOperand(0);
13303    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
13304      uint64_t ShiftAmt = C->getZExtValue();
13305      if (VT == MVT::i32 && ShiftAmt == 16 &&
13306          DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
13307        return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
13308      if (VT == MVT::i64 && ShiftAmt == 32 &&
13309          DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
13310        return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
13311 }
13312 }
13313 return SDValue();
13314}
13315
13316// Attempt to form urhadd(OpA, OpB) from
13317// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1))
13318// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)).
13319// The original form of the first expression is
13320// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the
13321// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)).
13322// Before this function is called the srl will have been lowered to
13323// AArch64ISD::VLSHR.
13324// This pass can also recognize signed variants of the patterns that use sign
13325// extension instead of zero extension and form a srhadd(OpA, OpB) or a
13326// shadd(OpA, OpB) from them.
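// As a rough illustration of the intended effect: for v8i8 operands,
//   truncate(vlshr(add(zext(OpA), zext(OpB)), 1))
// becomes a single (uhadd OpA, OpB), i.e. one UHADD.8B, instead of the
// widen/add/shift/narrow sequence.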
13327static SDValue
13329 SelectionDAG &DAG) {
13330 EVT VT = N->getValueType(0);
13331
13332 // Since we are looking for a right shift by a constant value of 1 and we are
13333 // operating on types at least 16 bits in length (sign/zero extended OpA and
13334 // OpB, which are at least 8 bits), it follows that the truncate will always
13335 // discard the shifted-in bit and therefore the right shift will be logical
13336 // regardless of the signedness of OpA and OpB.
13337 SDValue Shift = N->getOperand(0);
13338 if (Shift.getOpcode() != AArch64ISD::VLSHR)
13339 return SDValue();
13340
13341 // Is the right shift using an immediate value of 1?
13342 uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
13343 if (ShiftAmount != 1)
13344 return SDValue();
13345
13346  SDValue ExtendOpA, ExtendOpB;
13347  SDValue ShiftOp0 = Shift.getOperand(0);
13348 unsigned ShiftOp0Opc = ShiftOp0.getOpcode();
13349 if (ShiftOp0Opc == ISD::SUB) {
13350
13351 SDValue Xor = ShiftOp0.getOperand(1);
13352 if (Xor.getOpcode() != ISD::XOR)
13353 return SDValue();
13354
13355 // Is the XOR using a constant amount of all ones in the right hand side?
13356 uint64_t C;
13357 if (!isAllConstantBuildVector(Xor.getOperand(1), C))
13358 return SDValue();
13359
13360 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13363 return SDValue();
13364
13365 ExtendOpA = Xor.getOperand(0);
13366 ExtendOpB = ShiftOp0.getOperand(0);
13367 } else if (ShiftOp0Opc == ISD::ADD) {
13368 ExtendOpA = ShiftOp0.getOperand(0);
13369 ExtendOpB = ShiftOp0.getOperand(1);
13370 } else
13371 return SDValue();
13372
13373 unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
13374 unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
13375 if (!(ExtendOpAOpc == ExtendOpBOpc &&
13377 return SDValue();
13378
13379 // Is the result of the right shift being truncated to the same value type as
13380 // the original operands, OpA and OpB?
13381 SDValue OpA = ExtendOpA.getOperand(0);
13382 SDValue OpB = ExtendOpB.getOperand(0);
13383 EVT OpAVT = OpA.getValueType();
13384 assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
13385 if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
13386 return SDValue();
13387
13388 SDLoc DL(N);
13390 bool IsRHADD = ShiftOp0Opc == ISD::SUB;
13391 unsigned HADDOpc = IsSignExtend
13394 SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB);
13395
13396 return ResultHADD;
13397}
13398
13399static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
13400 switch (Opcode) {
13401 case ISD::FADD:
13402 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
13403 case ISD::ADD:
13404 return VT == MVT::i64;
13405 default:
13406 return false;
13407 }
13408}
13409
13410static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
13411  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
13412  ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
13413
13414 EVT VT = N->getValueType(0);
13415 const bool FullFP16 =
13416 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
13417
13418 // Rewrite for pairwise fadd pattern
13419 // (f32 (extract_vector_elt
13420 // (fadd (vXf32 Other)
13421 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
13422 // ->
13423 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
13424 // (extract_vector_elt (vXf32 Other) 1))
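  // After this rewrite the scalar fadd of lanes 0 and 1 can usually be
  // selected as a single pairwise FADDP on the low pair of lanes (e.g.
  // "faddp s0, v0.2s" for f32); this is a sketch of the motivation, the
  // final selection is left to isel.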
13425 if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
13426 hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) {
13427 SDLoc DL(N0);
13428 SDValue N00 = N0->getOperand(0);
13429 SDValue N01 = N0->getOperand(1);
13430
13431    ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
13432    SDValue Other = N00;
13433
13434 // And handle the commutative case.
13435 if (!Shuffle) {
13436      Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
13437      Other = N01;
13438 }
13439
13440 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
13441 Other == Shuffle->getOperand(0)) {
13442 return DAG.getNode(N0->getOpcode(), DL, VT,
13444 DAG.getConstant(0, DL, MVT::i64)),
13446 DAG.getConstant(1, DL, MVT::i64)));
13447 }
13448 }
13449
13450 return SDValue();
13451}
13452
13453static SDValue performConcatVectorsCombine(SDNode *N,
13454                                           TargetLowering::DAGCombinerInfo &DCI,
13455                                           SelectionDAG &DAG) {
13456 SDLoc dl(N);
13457 EVT VT = N->getValueType(0);
13458 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
13459 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
13460
13461 // Optimize concat_vectors of truncated vectors, where the intermediate
13462 // type is illegal, to avoid said illegality, e.g.,
13463 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
13464 // (v2i16 (truncate (v2i64)))))
13465 // ->
13466 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
13467 // (v4i32 (bitcast (v2i64))),
13468 // <0, 2, 4, 6>)))
13469 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
13470 // on both input and result type, so we might generate worse code.
13471 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
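  // As an illustration: the <0, 2, 4, 6> even-element shuffle built below
  // corresponds to a UZP1, so the v2i64 example above can end up as a UZP1
  // followed by a narrowing truncate rather than anything involving the
  // illegal v2i16 type (a sketch, the exact selection may vary).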
13472 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
13473 N1Opc == ISD::TRUNCATE) {
13474 SDValue N00 = N0->getOperand(0);
13475 SDValue N10 = N1->getOperand(0);
13476 EVT N00VT = N00.getValueType();
13477
13478 if (N00VT == N10.getValueType() &&
13479 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
13480 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
13482 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
13483 for (size_t i = 0; i < Mask.size(); ++i)
13484 Mask[i] = i * 2;
13485 return DAG.getNode(ISD::TRUNCATE, dl, VT,
13486 DAG.getVectorShuffle(
13487 MidVT, dl,
13488 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
13489 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
13490 }
13491 }
13492
13493 // Wait 'til after everything is legalized to try this. That way we have
13494 // legal vector types and such.
13495 if (DCI.isBeforeLegalizeOps())
13496 return SDValue();
13497
13498 // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted
13499 // subvectors from the same original vectors. Combine these into a single
13500 // [us]rhadd or [us]hadd that operates on the two original vectors. Example:
13501 // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
13502 // extract_subvector (v16i8 OpB,
13503 // <0>))),
13504 // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
13505 // extract_subvector (v16i8 OpB,
13506 // <8>)))))
13507 // ->
13508 // (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
13509 if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
13512 SDValue N00 = N0->getOperand(0);
13513 SDValue N01 = N0->getOperand(1);
13514 SDValue N10 = N1->getOperand(0);
13515 SDValue N11 = N1->getOperand(1);
13516
13517 EVT N00VT = N00.getValueType();
13518 EVT N10VT = N10.getValueType();
13519
13520 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13521 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13522 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13523 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
13524 SDValue N00Source = N00->getOperand(0);
13525 SDValue N01Source = N01->getOperand(0);
13526 SDValue N10Source = N10->getOperand(0);
13527 SDValue N11Source = N11->getOperand(0);
13528
13529 if (N00Source == N10Source && N01Source == N11Source &&
13530 N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
13531 assert(N0.getValueType() == N1.getValueType());
13532
13533 uint64_t N00Index = N00.getConstantOperandVal(1);
13534 uint64_t N01Index = N01.getConstantOperandVal(1);
13535 uint64_t N10Index = N10.getConstantOperandVal(1);
13536 uint64_t N11Index = N11.getConstantOperandVal(1);
13537
13538 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
13539 N10Index == N00VT.getVectorNumElements())
13540 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
13541 }
13542 }
13543 }
13544
13545 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
13546 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
13547 // canonicalise to that.
13548 if (N0 == N1 && VT.getVectorNumElements() == 2) {
13549 assert(VT.getScalarSizeInBits() == 64);
13550 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
13551 DAG.getConstant(0, dl, MVT::i64));
13552 }
13553
13554 // Canonicalise concat_vectors so that the right-hand vector has as few
13555 // bit-casts as possible before its real operation. The primary matching
13556 // destination for these operations will be the narrowing "2" instructions,
13557 // which depend on the operation being performed on this right-hand vector.
13558 // For example,
13559 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
13560 // becomes
13561 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
13562
13563 if (N1Opc != ISD::BITCAST)
13564 return SDValue();
13565 SDValue RHS = N1->getOperand(0);
13566 MVT RHSTy = RHS.getValueType().getSimpleVT();
13567 // If the RHS is not a vector, this is not the pattern we're looking for.
13568 if (!RHSTy.isVector())
13569 return SDValue();
13570
13571 LLVM_DEBUG(
13572 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
13573
13574 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
13575 RHSTy.getVectorNumElements() * 2);
13576 return DAG.getNode(ISD::BITCAST, dl, VT,
13578 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
13579 RHS));
13580}
13581
13584 SelectionDAG &DAG) {
13585 // Wait until after everything is legalized to try this. That way we have
13586 // legal vector types and such.
13587 if (DCI.isBeforeLegalizeOps())
13588 return SDValue();
13589 // Transform a scalar conversion of a value from a lane extract into a
13590 // lane extract of a vector conversion. E.g., from foo1 to foo2:
13591 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
13592 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
13593 //
13594 // The second form interacts better with instruction selection and the
13595 // register allocator to avoid cross-class register copies that aren't
13596 // coalescable due to a lane reference.
13597
13598 // Check the operand and see if it originates from a lane extract.
13599 SDValue Op1 = N->getOperand(1);
13600 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
13601 // Yep, no additional predication needed. Perform the transform.
13602 SDValue IID = N->getOperand(0);
13603 SDValue Shift = N->getOperand(2);
13604 SDValue Vec = Op1.getOperand(0);
13605 SDValue Lane = Op1.getOperand(1);
13606 EVT ResTy = N->getValueType(0);
13607 EVT VecResTy;
13608 SDLoc DL(N);
13609
13610 // The vector width should be 128 bits by the time we get here, even
13611 // if it started as 64 bits (the extract_vector handling will have
13612 // done so).
13613 assert(Vec.getValueSizeInBits() == 128 &&
13614 "unexpected vector size on extract_vector_elt!");
13615 if (Vec.getValueType() == MVT::v4i32)
13617 else if (Vec.getValueType() == MVT::v2i64)
13619 else
13620 llvm_unreachable("unexpected vector type!");
13621
13622 SDValue Convert =
13623 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
13624 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
13625 }
13626 return SDValue();
13627}
13628
13629// AArch64 high-vector "long" operations are formed by performing the non-high
13630// version on an extract_subvector of each operand which gets the high half:
13631//
13632// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
13633//
13634// However, there are cases which don't have an extract_high explicitly, but
13635// have another operation that can be made compatible with one for free. For
13636// example:
13637//
13638// (dupv64 scalar) --> (extract_high (dup128 scalar))
13639//
13640// This routine does the actual conversion of such DUPs, once outer routines
13641// have determined that everything else is in order.
13642// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
13643// similarly here.
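// For example (a sketch): rewriting (dup v2i32 s) as
// (extract_high (dup v4i32 s)) lets a surrounding widening multiply match
// the SMULL2/UMULL2 patterns on the high half instead of needing a separate
// extract_high.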
13644static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
13645  switch (N.getOpcode()) {
13646 case AArch64ISD::DUP:
13651 case AArch64ISD::MOVI:
13657 break;
13658 default:
13659 // FMOV could be supported, but isn't very useful, as it would only occur
13660    // if you passed a bitcasted floating point immediate to an eligible long
13661 // integer op (addl, smull, ...).
13662 return SDValue();
13663 }
13664
13665 MVT NarrowTy = N.getSimpleValueType();
13666 if (!NarrowTy.is64BitVector())
13667 return SDValue();
13668
13669 MVT ElementTy = NarrowTy.getVectorElementType();
13670 unsigned NumElems = NarrowTy.getVectorNumElements();
13671 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
13672
13673 SDLoc dl(N);
13674 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
13675 DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
13676 DAG.getConstant(NumElems, dl, MVT::i64));
13677}
13678
13679static bool isEssentiallyExtractHighSubvector(SDValue N) {
13680  if (N.getOpcode() == ISD::BITCAST)
13681 N = N.getOperand(0);
13682 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
13683 return false;
13684 if (N.getOperand(0).getValueType().isScalableVector())
13685 return false;
13686 return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
13687 N.getOperand(0).getValueType().getVectorNumElements() / 2;
13688}
13689
13690/// Helper structure to keep track of ISD::SET_CC operands.
13696
13697/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
13702
13703/// Helper structure to keep track of SetCC information.
13708
13709/// Helper structure to be able to read SetCC information. If the IsAArch64
13710/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
13711/// GenericSetCCInfo.
13716
13717/// Check whether or not \p Op is a SET_CC operation, either a generic or
13718/// an AArch64 lowered one.
13720/// \p SetCCInfo is filled accordingly.
13721/// \post SetCCInfo is meaningful only when this function returns true.
13722/// \return True when Op is a kind of SET_CC operation.
13723static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
13724  // If this is a setcc, this is straightforward.
13725 if (Op.getOpcode() == ISD::SETCC) {
13726 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
13727 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
13728 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
13729 SetCCInfo.IsAArch64 = false;
13730 return true;
13731 }
13732 // Otherwise, check if this is a matching csel instruction.
13733 // In other words:
13734 // - csel 1, 0, cc
13735 // - csel 0, 1, !cc
13736 if (Op.getOpcode() != AArch64ISD::CSEL)
13737 return false;
13738 // Set the information about the operands.
13739 // TODO: we want the operands of the Cmp not the csel
13740 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
13741 SetCCInfo.IsAArch64 = true;
13742 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
13743 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
13744
13745  // Check that the operands match the constraints:
13746 // (1) Both operands must be constants.
13747 // (2) One must be 1 and the other must be 0.
13748 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
13749 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13750
13751 // Check (1).
13752 if (!TValue || !FValue)
13753 return false;
13754
13755 // Check (2).
13756 if (!TValue->isOne()) {
13757 // Update the comparison when we are interested in !cc.
13759 SetCCInfo.Info.AArch64.CC =
13761 }
13762 return TValue->isOne() && FValue->isNullValue();
13763}
13764
13765// Returns true if Op is setcc or zext of setcc.
13766static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
13767 if (isSetCC(Op, Info))
13768 return true;
13769 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
13770 isSetCC(Op->getOperand(0), Info));
13771}
13772
13773// The folding we want to perform is:
13774// (add x, [zext] (setcc cc ...) )
13775// -->
13776// (csel x, (add x, 1), !cc ...)
13777//
13778// The latter will get matched to a CSINC instruction.
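// For example (sketch): for "x + (a == b)" this gives roughly
//   cmp  w1, w2
//   cinc w0, w0, eq        // an alias of csinc w0, w0, w0, ne
// rather than materialising the setcc result and adding it separately.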
13779static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
13780  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
13781 SDValue LHS = Op->getOperand(0);
13782 SDValue RHS = Op->getOperand(1);
13784
13785 // If both operands are a SET_CC, then we don't want to perform this
13786 // folding and create another csel as this results in more instructions
13787 // (and higher register usage).
13788 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
13790 return SDValue();
13791
13792 // If neither operand is a SET_CC, give up.
13793 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
13794 std::swap(LHS, RHS);
13796 return SDValue();
13797 }
13798
13799  // FIXME: This could be generalized to work for FP comparisons.
13800 EVT CmpVT = InfoAndKind.IsAArch64
13801 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
13802 : InfoAndKind.Info.Generic.Opnd0->getValueType();
13803 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
13804 return SDValue();
13805
13806 SDValue CCVal;
13807 SDValue Cmp;
13808 SDLoc dl(Op);
13809 if (InfoAndKind.IsAArch64) {
13810 CCVal = DAG.getConstant(
13811 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
13812 MVT::i32);
13813 Cmp = *InfoAndKind.Info.AArch64.Cmp;
13814 } else
13815 Cmp = getAArch64Cmp(
13816 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
13817 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
13818 dl);
13819
13820 EVT VT = Op->getValueType(0);
13821 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
13822 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
13823}
13824
13825// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
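// E.g. instead of two separate [U]ADDV reductions whose scalar results are
// then added, this performs one vector ADD followed by a single UADDV (a
// sketch; only zero-index extracts of UADDV results are matched below).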
13826static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
13827  EVT VT = N->getValueType(0);
13828  // Only handle ADD nodes that produce a scalar integer result.
13829 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
13830 return SDValue();
13831
13832 SDValue LHS = N->getOperand(0);
13833 SDValue RHS = N->getOperand(1);
13834 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13835 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
13836 return SDValue();
13837
13838 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
13839 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
13840 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue())
13841 return SDValue();
13842
13843 SDValue Op1 = LHS->getOperand(0);
13844 SDValue Op2 = RHS->getOperand(0);
13845 EVT OpVT1 = Op1.getValueType();
13846 EVT OpVT2 = Op2.getValueType();
13847 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
13848 Op2.getOpcode() != AArch64ISD::UADDV ||
13849 OpVT1.getVectorElementType() != VT)
13850 return SDValue();
13851
13852 SDValue Val1 = Op1.getOperand(0);
13853 SDValue Val2 = Op2.getOperand(0);
13854 EVT ValVT = Val1->getValueType(0);
13855 SDLoc DL(N);
13856 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
13857 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
13858 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
13859 DAG.getConstant(0, DL, MVT::i64));
13860}
13861
13862// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
13863static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
13864  EVT VT = N->getValueType(0);
13865 if (N->getOpcode() != ISD::ADD)
13866 return SDValue();
13867
13868 SDValue Dot = N->getOperand(0);
13869 SDValue A = N->getOperand(1);
13870  // Handle commutativity
13871 auto isZeroDot = [](SDValue Dot) {
13872 return (Dot.getOpcode() == AArch64ISD::UDOT ||
13873 Dot.getOpcode() == AArch64ISD::SDOT) &&
13875 };
13876 if (!isZeroDot(Dot))
13877 std::swap(Dot, A);
13878 if (!isZeroDot(Dot))
13879 return SDValue();
13880
13881 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
13882 Dot.getOperand(2));
13883}
13884
13885// The basic add/sub long vector instructions have variants with "2" on the end
13886// which act on the high-half of their inputs. They are normally matched by
13887// patterns like:
13888//
13889// (add (zeroext (extract_high LHS)),
13890// (zeroext (extract_high RHS)))
13891// -> uaddl2 vD, vN, vM
13892//
13893// However, if one of the extracts is something like a duplicate, this
13894// instruction can still be used profitably. This function puts the DAG into a
13895// more appropriate form for those patterns to trigger.
13896static SDValue performAddSubLongCombine(SDNode *N,
13897                                        TargetLowering::DAGCombinerInfo &DCI,
13898                                        SelectionDAG &DAG) {
13899 if (DCI.isBeforeLegalizeOps())
13900 return SDValue();
13901
13902 MVT VT = N->getSimpleValueType(0);
13903 if (!VT.is128BitVector()) {
13904 if (N->getOpcode() == ISD::ADD)
13905 return performSetccAddFolding(N, DAG);
13906 return SDValue();
13907 }
13908
13909 // Make sure both branches are extended in the same way.
13910 SDValue LHS = N->getOperand(0);
13911 SDValue RHS = N->getOperand(1);
13912 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
13913 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
13914 LHS.getOpcode() != RHS.getOpcode())
13915 return SDValue();
13916
13917 unsigned ExtType = LHS.getOpcode();
13918
13919 // It's not worth doing if at least one of the inputs isn't already an
13920 // extract, but we don't know which it'll be so we have to try both.
13921 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
13922 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
13923 if (!RHS.getNode())
13924 return SDValue();
13925
13926 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
13927 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
13928 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
13929 if (!LHS.getNode())
13930 return SDValue();
13931
13932 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
13933 }
13934
13935 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
13936}
13937
13940 SelectionDAG &DAG) {
13941 // Try to change sum of two reductions.
13942 if (SDValue Val = performUADDVCombine(N, DAG))
13943 return Val;
13944 if (SDValue Val = performAddDotCombine(N, DAG))
13945 return Val;
13946
13947 return performAddSubLongCombine(N, DCI, DAG);
13948}
13949
13950// Massage DAGs which we can use the high-half "long" operations on into
13951// something isel will recognize better. E.g.
13952//
13953// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
13954// (aarch64_neon_umull (extract_high (v2i64 vec)))
13955// (extract_high (v2i64 (dup128 scalar)))))
13956//
13957static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
13958                                       TargetLowering::DAGCombinerInfo &DCI,
13959                                       SelectionDAG &DAG) {
13960 if (DCI.isBeforeLegalizeOps())
13961 return SDValue();
13962
13963 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
13964 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
13965 assert(LHS.getValueType().is64BitVector() &&
13966 RHS.getValueType().is64BitVector() &&
13967 "unexpected shape for long operation");
13968
13969 // Either node could be a DUP, but it's not worth doing both of them (you'd
13970 // just as well use the non-high version) so look for a corresponding extract
13971 // operation on the other "wing".
13973 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
13974 if (!RHS.getNode())
13975 return SDValue();
13976 } else if (isEssentiallyExtractHighSubvector(RHS)) {
13977 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
13978 if (!LHS.getNode())
13979 return SDValue();
13980 }
13981
13982 if (IID == Intrinsic::not_intrinsic)
13983 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
13984
13985 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
13986 N->getOperand(0), LHS, RHS);
13987}
13988
13989static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
13990 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
13991 unsigned ElemBits = ElemTy.getSizeInBits();
13992
13993 int64_t ShiftAmount;
13994 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
13995 APInt SplatValue, SplatUndef;
13996 unsigned SplatBitSize;
13997 bool HasAnyUndefs;
13998 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
14000 SplatBitSize != ElemBits)
14001 return SDValue();
14002
14003 ShiftAmount = SplatValue.getSExtValue();
14004 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
14005 ShiftAmount = CVN->getSExtValue();
14006 } else
14007 return SDValue();
14008
14009 unsigned Opcode;
14010 bool IsRightShift;
14011 switch (IID) {
14012 default:
14013 llvm_unreachable("Unknown shift intrinsic");
14014 case Intrinsic::aarch64_neon_sqshl:
14015 Opcode = AArch64ISD::SQSHL_I;
14016 IsRightShift = false;
14017 break;
14018 case Intrinsic::aarch64_neon_uqshl:
14019 Opcode = AArch64ISD::UQSHL_I;
14020 IsRightShift = false;
14021 break;
14022 case Intrinsic::aarch64_neon_srshl:
14023 Opcode = AArch64ISD::SRSHR_I;
14024 IsRightShift = true;
14025 break;
14026 case Intrinsic::aarch64_neon_urshl:
14027 Opcode = AArch64ISD::URSHR_I;
14028 IsRightShift = true;
14029 break;
14030 case Intrinsic::aarch64_neon_sqshlu:
14031 Opcode = AArch64ISD::SQSHLU_I;
14032 IsRightShift = false;
14033 break;
14034 case Intrinsic::aarch64_neon_sshl:
14035 case Intrinsic::aarch64_neon_ushl:
14036    // ushl/sshl perform a regular left shift for positive shift amounts, so
14037    // for those we can use SHL. Below, we only replace the current node with
14038    // VSHL if this condition is met.
14039 Opcode = AArch64ISD::VSHL;
14040 IsRightShift = false;
14041 break;
14042 }
14043
14045 SDLoc dl(N);
14046 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
14047 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
14048 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
14049 SDLoc dl(N);
14050 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
14051 DAG.getConstant(ShiftAmount, dl, MVT::i32));
14052 }
14053
14054 return SDValue();
14055}
14056
14057// The CRC32[BH] instructions ignore the high bits of their data operand. Since
14058// the intrinsics must be legal and take an i32, this means there's almost
14059// certainly going to be a zext in the DAG which we can eliminate.
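// For example (sketch): CRC32B only reads the low 8 bits of its data
// operand, so in "crc32b(acc, zext(i8 x))" the zero-extension's masking
// (and x, 0xff) can be dropped and the wider register used directly.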
14060static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
14061 SDValue AndN = N->getOperand(2);
14062 if (AndN.getOpcode() != ISD::AND)
14063 return SDValue();
14064
14066 if (!CMask || CMask->getZExtValue() != Mask)
14067 return SDValue();
14068
14070 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
14071}
14072
14074 SelectionDAG &DAG) {
14075 SDLoc dl(N);
14076 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
14077 DAG.getNode(Opc, dl,
14078 N->getOperand(1).getSimpleValueType(),
14079 N->getOperand(1)),
14080 DAG.getConstant(0, dl, MVT::i64));
14081}
14082
14084 SDLoc DL(N);
14085 SDValue Op1 = N->getOperand(1);
14086 SDValue Op2 = N->getOperand(2);
14087 EVT ScalarTy = Op2.getValueType();
14088 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
14089 ScalarTy = MVT::i32;
14090
14091  // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
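  // E.g. index_vector(10, 3) yields the vector <10, 13, 16, 19, ...>, built
  // below as step_vector(1) * splat(3) + splat(10) (a worked illustration of
  // the expansion, using example operand values).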
14092 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
14093 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
14094 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
14095 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
14096 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
14097}
14098
14100 SDLoc dl(N);
14101 SDValue Scalar = N->getOperand(3);
14102 EVT ScalarTy = Scalar.getValueType();
14103
14104 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
14105 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
14106
14107 SDValue Passthru = N->getOperand(1);
14108 SDValue Pred = N->getOperand(2);
14109 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
14110 Pred, Scalar, Passthru);
14111}
14112
14114 SDLoc dl(N);
14115 LLVMContext &Ctx = *DAG.getContext();
14116 EVT VT = N->getValueType(0);
14117
14118 assert(VT.isScalableVector() && "Expected a scalable vector.");
14119
14120 // Current lowering only supports the SVE-ACLE types.
14122 return SDValue();
14123
14124 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
14125 unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
14126 EVT ByteVT =
14128
14129 // Convert everything to the domain of EXT (i.e bytes).
14130 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
14131 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
14132 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
14133 DAG.getConstant(ElemSize, dl, MVT::i32));
14134
14135 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
14136 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
14137}
14138
14141 SelectionDAG &DAG) {
14142 if (DCI.isBeforeLegalize())
14143 return SDValue();
14144
14145 SDValue Comparator = N->getOperand(3);
14146 if (Comparator.getOpcode() == AArch64ISD::DUP ||
14147 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
14148 unsigned IID = getIntrinsicID(N);
14149 EVT VT = N->getValueType(0);
14150 EVT CmpVT = N->getOperand(2).getValueType();
14151 SDValue Pred = N->getOperand(1);
14152 SDValue Imm;
14153 SDLoc DL(N);
14154
14155 switch (IID) {
14156 default:
14157 llvm_unreachable("Called with wrong intrinsic!");
14158 break;
14159
14160 // Signed comparisons
14161 case Intrinsic::aarch64_sve_cmpeq_wide:
14162 case Intrinsic::aarch64_sve_cmpne_wide:
14163 case Intrinsic::aarch64_sve_cmpge_wide:
14164 case Intrinsic::aarch64_sve_cmpgt_wide:
14165 case Intrinsic::aarch64_sve_cmplt_wide:
14166 case Intrinsic::aarch64_sve_cmple_wide: {
14167 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
14168 int64_t ImmVal = CN->getSExtValue();
14169 if (ImmVal >= -16 && ImmVal <= 15)
14170 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
14171 else
14172 return SDValue();
14173 }
14174 break;
14175 }
14176 // Unsigned comparisons
14177 case Intrinsic::aarch64_sve_cmphs_wide:
14178 case Intrinsic::aarch64_sve_cmphi_wide:
14179 case Intrinsic::aarch64_sve_cmplo_wide:
14180 case Intrinsic::aarch64_sve_cmpls_wide: {
14181 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
14182 uint64_t ImmVal = CN->getZExtValue();
14183 if (ImmVal <= 127)
14184 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
14185 else
14186 return SDValue();
14187 }
14188 break;
14189 }
14190 }
14191
14192 if (!Imm)
14193 return SDValue();
14194
14195 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
14196 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
14197 N->getOperand(2), Splat, DAG.getCondCode(CC));
14198 }
14199
14200 return SDValue();
14201}
14202
14205 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14206
14207 SDLoc DL(Op);
14208 assert(Op.getValueType().isScalableVector() &&
14209 TLI.isTypeLegal(Op.getValueType()) &&
14210 "Expected legal scalable vector type!");
14211
14212 // Ensure target specific opcodes are using legal type.
14213 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
14214 SDValue TVal = DAG.getConstant(1, DL, OutVT);
14215 SDValue FVal = DAG.getConstant(0, DL, OutVT);
14216
14217 // Set condition code (CC) flags.
14219
14220 // Convert CC to integer based on requested condition.
14221 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
14222 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
14223 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
14224 return DAG.getZExtOrTrunc(Res, DL, VT);
14225}
14226
14228 SelectionDAG &DAG) {
14229 SDLoc DL(N);
14230
14231 SDValue Pred = N->getOperand(1);
14232 SDValue VecToReduce = N->getOperand(2);
14233
14234 // NOTE: The integer reduction's result type is not always linked to the
14235 // operand's element type so we construct it from the intrinsic's result type.
14236 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
14237 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
14238
14239 // SVE reductions set the whole vector register with the first element
14240 // containing the reduction result, which we'll now extract.
14241 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14242 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
14243 Zero);
14244}
14245
14247 SelectionDAG &DAG) {
14248 SDLoc DL(N);
14249
14250 SDValue Pred = N->getOperand(1);
14251 SDValue VecToReduce = N->getOperand(2);
14252
14253 EVT ReduceVT = VecToReduce.getValueType();
14254 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
14255
14256 // SVE reductions set the whole vector register with the first element
14257 // containing the reduction result, which we'll now extract.
14258 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14259 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
14260 Zero);
14261}
14262
14264 SelectionDAG &DAG) {
14265 SDLoc DL(N);
14266
14267 SDValue Pred = N->getOperand(1);
14268 SDValue InitVal = N->getOperand(2);
14269 SDValue VecToReduce = N->getOperand(3);
14270 EVT ReduceVT = VecToReduce.getValueType();
14271
14272 // Ordered reductions use the first lane of the result vector as the
14273 // reduction's initial value.
14274 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14276 DAG.getUNDEF(ReduceVT), InitVal, Zero);
14277
14278 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
14279
14280 // SVE reductions set the whole vector register with the first element
14281 // containing the reduction result, which we'll now extract.
14282 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
14283 Zero);
14284}
14285
14287 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14288
14289 // Look through cast.
14290 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14291 N = N.getOperand(0);
14292 // When reinterpreting from a type with fewer elements the "new" elements
14293 // are not active, so bail if they're likely to be used.
14294 if (N.getValueType().getVectorMinNumElements() < NumElts)
14295 return false;
14296 }
14297
14298 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14299 // or smaller than the implicit element type represented by N.
14300 // NOTE: A larger element count implies a smaller element type.
14301 if (N.getOpcode() == AArch64ISD::PTRUE &&
14302 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14303 return N.getValueType().getVectorMinNumElements() >= NumElts;
14304
14305 return false;
14306}
14307
14308// If a merged operation has no inactive lanes we can relax it to a predicated
14309// or unpredicated operation, which potentially allows better isel (perhaps
14310// using immediate forms) or relaxing register reuse requirements.
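// For example (sketch): "aarch64.sve.add(ptrue_all, x, y)" has no inactive
// lanes, so it can be rewritten as a plain ISD::ADD, which lets isel use an
// unpredicated ADD (including immediate forms) instead of a predicated one.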
14311static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
14312                                       SelectionDAG &DAG,
14313 bool UnpredOp = false) {
14314 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
14315 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
14316 SDValue Pg = N->getOperand(1);
14317
14318 // ISD way to specify an all active predicate.
14319 if (isAllActivePredicate(Pg)) {
14320 if (UnpredOp)
14321 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(2),
14322 N->getOperand(3));
14323 else
14324 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg,
14325 N->getOperand(2), N->getOperand(3));
14326 }
14327
14328 // FUTURE: SplatVector(true)
14329 return SDValue();
14330}
14331
14334 const AArch64Subtarget *Subtarget) {
14335 SelectionDAG &DAG = DCI.DAG;
14336 unsigned IID = getIntrinsicID(N);
14337 switch (IID) {
14338 default:
14339 break;
14340 case Intrinsic::aarch64_neon_vcvtfxs2fp:
14341 case Intrinsic::aarch64_neon_vcvtfxu2fp:
14342 return tryCombineFixedPointConvert(N, DCI, DAG);
14343 case Intrinsic::aarch64_neon_saddv:
14345 case Intrinsic::aarch64_neon_uaddv:
14347 case Intrinsic::aarch64_neon_sminv:
14349 case Intrinsic::aarch64_neon_uminv:
14351 case Intrinsic::aarch64_neon_smaxv:
14353 case Intrinsic::aarch64_neon_umaxv:
14355 case Intrinsic::aarch64_neon_fmax:
14356 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
14357 N->getOperand(1), N->getOperand(2));
14358 case Intrinsic::aarch64_neon_fmin:
14359 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
14360 N->getOperand(1), N->getOperand(2));
14361 case Intrinsic::aarch64_neon_fmaxnm:
14362 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
14363 N->getOperand(1), N->getOperand(2));
14364 case Intrinsic::aarch64_neon_fminnm:
14365 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
14366 N->getOperand(1), N->getOperand(2));
14367 case Intrinsic::aarch64_neon_smull:
14368 case Intrinsic::aarch64_neon_umull:
14369 case Intrinsic::aarch64_neon_pmull:
14370 case Intrinsic::aarch64_neon_sqdmull:
14371 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
14372 case Intrinsic::aarch64_neon_sqshl:
14373 case Intrinsic::aarch64_neon_uqshl:
14374 case Intrinsic::aarch64_neon_sqshlu:
14375 case Intrinsic::aarch64_neon_srshl:
14376 case Intrinsic::aarch64_neon_urshl:
14377 case Intrinsic::aarch64_neon_sshl:
14378 case Intrinsic::aarch64_neon_ushl:
14379 return tryCombineShiftImm(IID, N, DAG);
14380 case Intrinsic::aarch64_crc32b:
14381 case Intrinsic::aarch64_crc32cb:
14382 return tryCombineCRC32(0xff, N, DAG);
14383 case Intrinsic::aarch64_crc32h:
14384 case Intrinsic::aarch64_crc32ch:
14385 return tryCombineCRC32(0xffff, N, DAG);
14386 case Intrinsic::aarch64_sve_saddv:
14387 // There is no i64 version of SADDV because the sign is irrelevant.
14388 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
14390 else
14392 case Intrinsic::aarch64_sve_uaddv:
14394 case Intrinsic::aarch64_sve_smaxv:
14396 case Intrinsic::aarch64_sve_umaxv:
14398 case Intrinsic::aarch64_sve_sminv:
14400 case Intrinsic::aarch64_sve_uminv:
14402 case Intrinsic::aarch64_sve_orv:
14404 case Intrinsic::aarch64_sve_eorv:
14406 case Intrinsic::aarch64_sve_andv:
14408 case Intrinsic::aarch64_sve_index:
14409 return LowerSVEIntrinsicIndex(N, DAG);
14410 case Intrinsic::aarch64_sve_dup:
14411 return LowerSVEIntrinsicDUP(N, DAG);
14412 case Intrinsic::aarch64_sve_dup_x:
14413 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
14414 N->getOperand(1));
14415 case Intrinsic::aarch64_sve_ext:
14416 return LowerSVEIntrinsicEXT(N, DAG);
14417 case Intrinsic::aarch64_sve_mul:
14419 case Intrinsic::aarch64_sve_smulh:
14421 case Intrinsic::aarch64_sve_umulh:
14423 case Intrinsic::aarch64_sve_smin:
14425 case Intrinsic::aarch64_sve_umin:
14427 case Intrinsic::aarch64_sve_smax:
14429 case Intrinsic::aarch64_sve_umax:
14431 case Intrinsic::aarch64_sve_lsl:
14433 case Intrinsic::aarch64_sve_lsr:
14435 case Intrinsic::aarch64_sve_asr:
14437 case Intrinsic::aarch64_sve_fadd:
14439 case Intrinsic::aarch64_sve_fsub:
14441 case Intrinsic::aarch64_sve_fmul:
14443 case Intrinsic::aarch64_sve_add:
14444 return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
14445 case Intrinsic::aarch64_sve_sub:
14446 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
14447 case Intrinsic::aarch64_sve_and:
14448 return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
14449 case Intrinsic::aarch64_sve_bic:
14450 return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
14451 case Intrinsic::aarch64_sve_eor:
14452 return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
14453 case Intrinsic::aarch64_sve_orr:
14454 return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
14455 case Intrinsic::aarch64_sve_sqadd:
14456 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
14457 case Intrinsic::aarch64_sve_sqsub:
14458 return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
14459 case Intrinsic::aarch64_sve_uqadd:
14460 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
14461 case Intrinsic::aarch64_sve_uqsub:
14462 return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
14463 case Intrinsic::aarch64_sve_sqadd_x:
14464 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
14465 N->getOperand(1), N->getOperand(2));
14466 case Intrinsic::aarch64_sve_sqsub_x:
14467 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
14468 N->getOperand(1), N->getOperand(2));
14469 case Intrinsic::aarch64_sve_uqadd_x:
14470 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
14471 N->getOperand(1), N->getOperand(2));
14472 case Intrinsic::aarch64_sve_uqsub_x:
14473 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
14474 N->getOperand(1), N->getOperand(2));
14475 case Intrinsic::aarch64_sve_cmphs:
14476 if (!N->getOperand(2).getValueType().isFloatingPoint())
14478 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14479 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
14480 break;
14481 case Intrinsic::aarch64_sve_cmphi:
14482 if (!N->getOperand(2).getValueType().isFloatingPoint())
14484 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14485 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
14486 break;
14487 case Intrinsic::aarch64_sve_fcmpge:
14488 case Intrinsic::aarch64_sve_cmpge:
14490 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14491 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
14492 break;
14493 case Intrinsic::aarch64_sve_fcmpgt:
14494 case Intrinsic::aarch64_sve_cmpgt:
14496 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14497 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
14498 break;
14499 case Intrinsic::aarch64_sve_fcmpeq:
14500 case Intrinsic::aarch64_sve_cmpeq:
14502 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14503 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
14504 break;
14505 case Intrinsic::aarch64_sve_fcmpne:
14506 case Intrinsic::aarch64_sve_cmpne:
14508 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14509 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
14510 break;
14511 case Intrinsic::aarch64_sve_fcmpuo:
14513 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14514 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
14515 break;
14516 case Intrinsic::aarch64_sve_fadda:
14518 case Intrinsic::aarch64_sve_faddv:
14520 case Intrinsic::aarch64_sve_fmaxnmv:
14522 case Intrinsic::aarch64_sve_fmaxv:
14524 case Intrinsic::aarch64_sve_fminnmv:
14526 case Intrinsic::aarch64_sve_fminv:
14528 case Intrinsic::aarch64_sve_sel:
14529 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
14530 N->getOperand(1), N->getOperand(2), N->getOperand(3));
14531 case Intrinsic::aarch64_sve_cmpeq_wide:
14533 case Intrinsic::aarch64_sve_cmpne_wide:
14535 case Intrinsic::aarch64_sve_cmpge_wide:
14537 case Intrinsic::aarch64_sve_cmpgt_wide:
14539 case Intrinsic::aarch64_sve_cmplt_wide:
14541 case Intrinsic::aarch64_sve_cmple_wide:
14543 case Intrinsic::aarch64_sve_cmphs_wide:
14545 case Intrinsic::aarch64_sve_cmphi_wide:
14547 case Intrinsic::aarch64_sve_cmplo_wide:
14549 case Intrinsic::aarch64_sve_cmpls_wide:
14551 case Intrinsic::aarch64_sve_ptest_any:
14552 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
14554 case Intrinsic::aarch64_sve_ptest_first:
14555 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
14557 case Intrinsic::aarch64_sve_ptest_last:
14558 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
14560 }
14561 return SDValue();
14562}
14563
14566 SelectionDAG &DAG) {
14567 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
14568 // we can convert that DUP into another extract_high (of a bigger DUP), which
14569 // helps the backend to decide that an sabdl2 would be useful, saving a real
14570 // extract_high operation.
14571 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
14572 (N->getOperand(0).getOpcode() == ISD::ABDU ||
14573 N->getOperand(0).getOpcode() == ISD::ABDS)) {
14574 SDNode *ABDNode = N->getOperand(0).getNode();
14575 SDValue NewABD =
14577 if (!NewABD.getNode())
14578 return SDValue();
14579
14580 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
14581 }
14582 return SDValue();
14583}
14584
14586 SDValue SplatVal, unsigned NumVecElts) {
14587 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
14588 unsigned OrigAlignment = St.getAlignment();
14589 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
14590
14591 // Create scalar stores. This is at least as good as the code sequence for a
14592 // split unaligned store which is a dup.s, ext.b, and two stores.
14593 // Most of the time the three stores should be replaced by store pair
14594 // instructions (stp).
14595 SDLoc DL(&St);
14596 SDValue BasePtr = St.getBasePtr();
14597 uint64_t BaseOffset = 0;
14598
14599 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
14600 SDValue NewST1 =
14601 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
14602 OrigAlignment, St.getMemOperand()->getFlags());
14603
14604  // As this is in ISel, we will not merge this add, which may degrade results.
14605 if (BasePtr->getOpcode() == ISD::ADD &&
14606 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
14607 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
14608 BasePtr = BasePtr->getOperand(0);
14609 }
14610
14611 unsigned Offset = EltOffset;
14612 while (--NumVecElts) {
14613 unsigned Alignment = MinAlign(OrigAlignment, Offset);
14614 SDValue OffsetPtr =
14615 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
14616 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
14617 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
14618 PtrInfo.getWithOffset(Offset), Alignment,
14619 St.getMemOperand()->getFlags());
14620 Offset += EltOffset;
14621 }
14622 return NewST1;
14623}
14624
14625// Returns an SVE type that ContentTy can be trivially sign or zero extended
14626// into.
14628 assert(ContentTy.isSimple() && "No SVE containers for extended types");
14629
14630 switch (ContentTy.getSimpleVT().SimpleTy) {
14631 default:
14632 llvm_unreachable("No known SVE container for this MVT type");
14633 case MVT::nxv2i8:
14634 case MVT::nxv2i16:
14635 case MVT::nxv2i32:
14636 case MVT::nxv2i64:
14637 case MVT::nxv2f32:
14638 case MVT::nxv2f64:
14639 return MVT::nxv2i64;
14640 case MVT::nxv4i8:
14641 case MVT::nxv4i16:
14642 case MVT::nxv4i32:
14643 case MVT::nxv4f32:
14644 return MVT::nxv4i32;
14645 case MVT::nxv8i8:
14646 case MVT::nxv8i16:
14647 case MVT::nxv8f16:
14648 case MVT::nxv8bf16:
14649 return MVT::nxv8i16;
14650 case MVT::nxv16i8:
14651 return MVT::nxv16i8;
14652 }
14653}
14654
14655static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
14656 SDLoc DL(N);
14657 EVT VT = N->getValueType(0);
14658
14660 return SDValue();
14661
14662 EVT ContainerVT = VT;
14663 if (ContainerVT.isInteger())
14665
14667 SDValue Ops[] = { N->getOperand(0), // Chain
14668 N->getOperand(2), // Pg
14669 N->getOperand(3), // Base
14670 DAG.getValueType(VT) };
14671
14672 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
14673 SDValue LoadChain = SDValue(Load.getNode(), 1);
14674
14675 if (ContainerVT.isInteger() && (VT != ContainerVT))
14676 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
14677
14678 return DAG.getMergeValues({ Load, LoadChain }, DL);
14679}
14680
14682 SDLoc DL(N);
14683 EVT VT = N->getValueType(0);
14684 EVT PtrTy = N->getOperand(3).getValueType();
14685
14686 if (VT == MVT::nxv8bf16 &&
14687 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14688 return SDValue();
14689
14690 EVT LoadVT = VT;
14691 if (VT.isFloatingPoint())
14693
14696 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
14697 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
14698 MINode->getOperand(2), PassThru,
14699 MINode->getMemoryVT(), MINode->getMemOperand(),
14701
14702 if (VT.isFloatingPoint()) {
14703 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
14704 return DAG.getMergeValues(Ops, DL);
14705 }
14706
14707 return L;
14708}
14709
14710template <unsigned Opcode>
14712 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
14714 "Unsupported opcode.");
14715 SDLoc DL(N);
14716 EVT VT = N->getValueType(0);
14717 if (VT == MVT::nxv8bf16 &&
14718 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14719 return SDValue();
14720
14721 EVT LoadVT = VT;
14722 if (VT.isFloatingPoint())
14724
14725 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
14726 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
14727 SDValue LoadChain = SDValue(Load.getNode(), 1);
14728
14729 if (VT.isFloatingPoint())
14730 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
14731
14732 return DAG.getMergeValues({Load, LoadChain}, DL);
14733}
14734
14736 SDLoc DL(N);
14737 SDValue Data = N->getOperand(2);
14738 EVT DataVT = Data.getValueType();
14741
14742 if (DataVT == MVT::nxv8bf16 &&
14743 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14744 return SDValue();
14745
14746 if (DataVT.isFloatingPoint())
14748
14750 if (Data.getValueType().isFloatingPoint())
14752 else
14754
14755 SDValue Ops[] = { N->getOperand(0), // Chain
14756 SrcNew,
14757 N->getOperand(4), // Base
14758 N->getOperand(3), // Pg
14759 InputVT
14760 };
14761
14762 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
14763}
14764
14766 SDLoc DL(N);
14767
14768 SDValue Data = N->getOperand(2);
14769 EVT DataVT = Data.getValueType();
14770 EVT PtrTy = N->getOperand(4).getValueType();
14771
14772 if (DataVT == MVT::nxv8bf16 &&
14773 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14774 return SDValue();
14775
14776 if (DataVT.isFloatingPoint())
14777 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
14778
14780 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
14781 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
14782 MINode->getMemoryVT(), MINode->getMemOperand(),
14783 ISD::UNINDEXED, false, false);
14784}
14785
14786/// Replace a store of a splat of zeros by scalar stores of WZR/XZR. The
14787/// load/store optimizer pass will merge them into store pair instructions.
14788/// This should be better than a movi to create the vector zero followed by a
14789/// vector store if the zero constant is not re-used, since one instruction and
14790/// one register live range will be removed.
14791///
14792/// For example, the final generated code should be:
14793///
14794/// stp xzr, xzr, [x0]
14795///
14796/// instead of:
14797///
14798/// movi v0.2d, #0
14799/// str q0, [x0]
14800///
14801static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
14802  SDValue StVal = St.getValue();
14803 EVT VT = StVal.getValueType();
14804
14805 // Avoid scalarizing zero splat stores for scalable vectors.
14806 if (VT.isScalableVector())
14807 return SDValue();
14808
14809 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
14810 // 2, 3 or 4 i32 elements.
14812 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
14813 VT.getVectorElementType().getSizeInBits() == 64) ||
14814 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
14815 VT.getVectorElementType().getSizeInBits() == 32)))
14816 return SDValue();
14817
14818 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
14819 return SDValue();
14820
14821 // If the zero constant has more than one use then the vector store could be
14822 // better since the constant mov will be amortized and stp q instructions
14823 // should be able to be formed.
14824 if (!StVal.hasOneUse())
14825 return SDValue();
14826
14827 // If the store is truncating then it's going down to i16 or smaller, which
14828 // means it can be implemented in a single store anyway.
14829 if (St.isTruncatingStore())
14830 return SDValue();
14831
14832 // If the immediate offset of the address operand is too large for the stp
14833 // instruction, then bail out.
14834 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
14835 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
14837 return SDValue();
14838 }
14839
14840 for (int I = 0; I < NumVecElts; ++I) {
14841 SDValue EltVal = StVal.getOperand(I);
14843 return SDValue();
14844 }
14845
14846 // Use a CopyFromReg WZR/XZR here to prevent
14847 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
14848 SDLoc DL(&St);
14849 unsigned ZeroReg;
14850 EVT ZeroVT;
14851 if (VT.getVectorElementType().getSizeInBits() == 32) {
14852 ZeroReg = AArch64::WZR;
14853 ZeroVT = MVT::i32;
14854 } else {
14855 ZeroReg = AArch64::XZR;
14856 ZeroVT = MVT::i64;
14857 }
14860 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
14861}
14862
14863/// Replace a store of a splatted scalar by scalar stores of that scalar
14864/// value. The load/store optimizer pass will merge them into store pair stores.
14865/// This has better performance than a splat of the scalar followed by a split
14866/// vector store. Even if the stores are not merged it is four stores vs a dup,
14867/// followed by an ext.b and two stores.
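/// For example (a sketch), a v4i32 store of a splatted w1 to [x0] becomes
/// four scalar stores that the load/store optimizer can pair up as:
///   stp w1, w1, [x0]
///   stp w1, w1, [x0, #8]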
14868static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
14869  SDValue StVal = St.getValue();
14870 EVT VT = StVal.getValueType();
14871
14872 // Don't replace floating point stores, they possibly won't be transformed to
14873 // stp because of the store pair suppress pass.
14874 if (VT.isFloatingPoint())
14875 return SDValue();
14876
14877 // We can express a splat as store pair(s) for 2 or 4 elements.
14878 unsigned NumVecElts = VT.getVectorNumElements();
14879 if (NumVecElts != 4 && NumVecElts != 2)
14880 return SDValue();
14881
14882 // If the store is truncating then it's going down to i16 or smaller, which
14883 // means it can be implemented in a single store anyway.
14884 if (St.isTruncatingStore())
14885 return SDValue();
14886
14887 // Check that this is a splat.
14888 // Make sure that each of the relevant vector element locations are inserted
14889 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
14890 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
14892 for (unsigned I = 0; I < NumVecElts; ++I) {
14893 // Check for insert vector elements.
14894 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
14895 return SDValue();
14896
14897 // Check that same value is inserted at each vector element.
14898 if (I == 0)
14899 SplatVal = StVal.getOperand(1);
14900 else if (StVal.getOperand(1) != SplatVal)
14901 return SDValue();
14902
14903 // Check insert element index.
14905 if (!CIndex)
14906 return SDValue();
14907 uint64_t IndexVal = CIndex->getZExtValue();
14908 if (IndexVal >= NumVecElts)
14909 return SDValue();
14911
14912 StVal = StVal.getOperand(0);
14913 }
14914 // Check that all vector element locations were inserted to.
14915 if (IndexNotInserted.any())
14916 return SDValue();
14917
14918 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
14919}
14920
14922 SelectionDAG &DAG,
14923 const AArch64Subtarget *Subtarget) {
14924
14926 if (S->isVolatile() || S->isIndexed())
14927 return SDValue();
14928
14929 SDValue StVal = S->getValue();
14930 EVT VT = StVal.getValueType();
14931
14932 if (!VT.isFixedLengthVector())
14933 return SDValue();
14934
14935 // If we get a splat of zeros, convert this vector store to a store of
14936 // scalars. They will be merged into store pairs of xzr thereby removing one
14937 // instruction and one register.
14939 return ReplacedZeroSplat;
14940
14941 // FIXME: The logic for deciding if an unaligned store should be split should
14942 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
14943 // a call to that function here.
14944
14945 if (!Subtarget->isMisaligned128StoreSlow())
14946 return SDValue();
14947
14948 // Don't split at -Oz.
14950 return SDValue();
14951
14952 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
14953 // those up regresses performance on micro-benchmarks and olden/bh.
14954 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
14955 return SDValue();
14956
14957 // Split unaligned 16B stores. They are terrible for performance.
14958 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
14959 // extensions can use this to mark that it does not want splitting to happen
14960 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
14961 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
14962 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
14963 S->getAlignment() <= 2)
14964 return SDValue();
14965
14966 // If we get a splat of a scalar convert this vector store to a store of
14967 // scalars. They will be merged into store pairs thereby removing two
14968 // instructions.
14970 return ReplacedSplat;
14971
14972 SDLoc DL(S);
14973
14974 // Split VT into two.
14976 unsigned NumElts = HalfVT.getVectorNumElements();
14978 DAG.getConstant(0, DL, MVT::i64));
14981 SDValue BasePtr = S->getBasePtr();
14982 SDValue NewST1 =
14983 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
14984 S->getAlignment(), S->getMemOperand()->getFlags());
14985 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
14986 DAG.getConstant(8, DL, MVT::i64));
14987 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
14988 S->getPointerInfo(), S->getAlignment(),
14989 S->getMemOperand()->getFlags());
14990}
14991
14993  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
14994
14995 // splice(pg, op1, undef) -> op1
14996 if (N->getOperand(2).isUndef())
14997 return N->getOperand(1);
14998
14999 return SDValue();
15000}
15001
15003 SDLoc DL(N);
15004 SDValue Op0 = N->getOperand(0);
15005 SDValue Op1 = N->getOperand(1);
15006 EVT ResVT = N->getValueType(0);
15007
15008 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
15009 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
15010 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
15011 SDValue X = Op0.getOperand(0).getOperand(0);
15012 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
15013 }
15014 }
15015
15016 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
15017 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
15018 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
15019 SDValue Z = Op1.getOperand(0).getOperand(1);
15020 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
15021 }
15022 }
15023
15024 return SDValue();
15025}
15026
15028 unsigned Opc = N->getOpcode();
15029
15030 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
15032 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
15034 "Invalid opcode.");
15035
15036 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
15038 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
15040 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
15044
15045 SDLoc DL(N);
15046 SDValue Chain = N->getOperand(0);
15047 SDValue Pg = N->getOperand(1);
15048 SDValue Base = N->getOperand(2);
15049 SDValue Offset = N->getOperand(3);
15050 SDValue Ty = N->getOperand(4);
15051
15052 EVT ResVT = N->getValueType(0);
15053
15054 const auto OffsetOpc = Offset.getOpcode();
15055 const bool OffsetIsZExt =
15057 const bool OffsetIsSExt =
15059
15060 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
15061 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
15062 SDValue ExtPg = Offset.getOperand(0);
15063 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
15064 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
15065
15066 // If the predicate for the sign- or zero-extended offset is the
15067 // same as the predicate used for this load and the sign-/zero-extension
15068 // was from 32 bits...
15069 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
15070 SDValue UnextendedOffset = Offset.getOperand(1);
15071
15072 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
15073 if (Signed)
15075
15076 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
15077 {Chain, Pg, Base, UnextendedOffset, Ty});
15078 }
15079 }
15080
15081 return SDValue();
15082}
15083
15084/// Optimize a vector shift instruction and its operand if the shifted-out
15085/// bits are not used.
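/// For example, a VLSHR of 32-bit lanes by 8 never uses the low 8 bits of its
/// operand, so those bits need not be demanded when simplifying the operand.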
15087 const AArch64TargetLowering &TLI,
15089 assert(N->getOpcode() == AArch64ISD::VASHR ||
15090 N->getOpcode() == AArch64ISD::VLSHR);
15091
15092 SDValue Op = N->getOperand(0);
15093 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
15094
15095 unsigned ShiftImm = N->getConstantOperandVal(1);
15096 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
15097
15100
15102 return SDValue(N, 0);
15103
15104 return SDValue();
15105}
15106
15107/// Target-specific DAG combine function for post-increment LD1 (lane) and
15108/// post-increment LD1R.
15111 bool IsLaneOp) {
15112 if (DCI.isBeforeLegalizeOps())
15113 return SDValue();
15114
15115 SelectionDAG &DAG = DCI.DAG;
15116 EVT VT = N->getValueType(0);
15117
15118 if (VT.isScalableVector())
15119 return SDValue();
15120
15121 unsigned LoadIdx = IsLaneOp ? 1 : 0;
15122 SDNode *LD = N->getOperand(LoadIdx).getNode();
15124 // If it is not a LOAD, we cannot do this combine.
15124 if (LD->getOpcode() != ISD::LOAD)
15125 return SDValue();
15126
15127 // The vector lane must be a constant in the LD1LANE opcode.
15128 SDValue Lane;
15129 if (IsLaneOp) {
15130 Lane = N->getOperand(2);
15131 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
15132 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
15133 return SDValue();
15134 }
15135
15137 EVT MemVT = LoadSDN->getMemoryVT();
15138 // Check if memory operand is the same type as the vector element.
15139 if (MemVT != VT.getVectorElementType())
15140 return SDValue();
15141
15142 // Check if there are other uses. If so, do not combine as it will introduce
15143 // an extra load.
15144 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
15145 ++UI) {
15146 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
15147 continue;
15148 if (*UI != N)
15149 return SDValue();
15150 }
15151
15152 SDValue Addr = LD->getOperand(1);
15153 SDValue Vector = N->getOperand(0);
15154 // Search for a use of the address operand that is an increment.
15155 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
15156 Addr.getNode()->use_end(); UI != UE; ++UI) {
15157 SDNode *User = *UI;
15158 if (User->getOpcode() != ISD::ADD
15159 || UI.getUse().getResNo() != Addr.getResNo())
15160 continue;
15161
15162 // If the increment is a constant, it must match the memory ref size.
15163 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15165 uint32_t IncVal = CInc->getZExtValue();
15166 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
15167 if (IncVal != NumBytes)
15168 continue;
15169 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
15170 }
15171
15172 // To avoid constructing a cycle, make sure that neither the load nor the add
15173 // is a predecessor of the other or of the Vector.
15176 Visited.insert(Addr.getNode());
15177 Worklist.push_back(User);
15178 Worklist.push_back(LD);
15179 Worklist.push_back(Vector.getNode());
15180 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
15181 SDNode::hasPredecessorHelper(User, Visited, Worklist))
15182 continue;
15183
15185 Ops.push_back(LD->getOperand(0)); // Chain
15186 if (IsLaneOp) {
15187 Ops.push_back(Vector); // The vector to be inserted
15188 Ops.push_back(Lane); // The lane to be inserted in the vector
15189 }
15190 Ops.push_back(Addr);
15191 Ops.push_back(Inc);
15192
15193 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
15194 SDVTList SDTys = DAG.getVTList(Tys);
15197 MemVT,
15198 LoadSDN->getMemOperand());
15199
15200 // Update the uses.
15201 SDValue NewResults[] = {
15202 SDValue(LD, 0), // The result of load
15203 SDValue(UpdN.getNode(), 2) // Chain
15204 };
15205 DCI.CombineTo(LD, NewResults);
15206 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
15207 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
15208
15209 break;
15210 }
15211 return SDValue();
15212}
15213
15214/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
15215/// address translation.
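/// For example, with TBI only the low 56 bits of a 64-bit address are actually
/// demanded, so an operation that merely modifies the top byte of ``Addr`` can
/// be simplified away.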
15218 SelectionDAG &DAG) {
15220 KnownBits Known;
15221 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
15222 !DCI.isBeforeLegalizeOps());
15223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15224 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
15225 DCI.CommitTargetLoweringOpt(TLO);
15226 return true;
15227 }
15228 return false;
15229}
15230
15232 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
15233 "Expected STORE dag node in input!");
15234
15235 if (auto Store = dyn_cast<StoreSDNode>(N)) {
15236 if (!Store->isTruncatingStore() || Store->isIndexed())
15237 return SDValue();
15238 SDValue Ext = Store->getValue();
15239 auto ExtOpCode = Ext.getOpcode();
15242 return SDValue();
15243 SDValue Orig = Ext->getOperand(0);
15244 if (Store->getMemoryVT() != Orig->getValueType(0))
15245 return SDValue();
15246 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
15247 Store->getBasePtr(), Store->getPointerInfo(),
15248 Store->getAlign());
15249 }
15250
15251 return SDValue();
15252}
15253
15256 SelectionDAG &DAG,
15257 const AArch64Subtarget *Subtarget) {
15258 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
15259 return Split;
15260
15261 if (Subtarget->supportsAddressTopByteIgnored() &&
15262 performTBISimplification(N->getOperand(2), DCI, DAG))
15263 return SDValue(N, 0);
15264
15265 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
15266 return Store;
15267
15268 return SDValue();
15269}
15270
15271/// Target-specific DAG combine function for NEON load/store intrinsics
15272/// to merge base address updates.
15275 SelectionDAG &DAG) {
15276 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15277 return SDValue();
15278
15279 unsigned AddrOpIdx = N->getNumOperands() - 1;
15280 SDValue Addr = N->getOperand(AddrOpIdx);
15281
15282 // Search for a use of the address operand that is an increment.
15283 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
15284 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
15285 SDNode *User = *UI;
15286 if (User->getOpcode() != ISD::ADD ||
15287 UI.getUse().getResNo() != Addr.getResNo())
15288 continue;
15289
15290 // Check that the add is independent of the load/store. Otherwise, folding
15291 // it would create a cycle.
15294 Visited.insert(Addr.getNode());
15295 Worklist.push_back(N);
15296 Worklist.push_back(User);
15297 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
15298 SDNode::hasPredecessorHelper(User, Visited, Worklist))
15299 continue;
15300
15301 // Find the new opcode for the updating load/store.
15302 bool IsStore = false;
15303 bool IsLaneOp = false;
15304 bool IsDupOp = false;
15305 unsigned NewOpc = 0;
15306 unsigned NumVecs = 0;
15307 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
15308 switch (IntNo) {
15309 default: llvm_unreachable("unexpected intrinsic for Neon base update");
15310 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
15311 NumVecs = 2; break;
15312 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
15313 NumVecs = 3; break;
15314 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
15315 NumVecs = 4; break;
15316 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
15317 NumVecs = 2; IsStore = true; break;
15318 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
15319 NumVecs = 3; IsStore = true; break;
15320 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
15321 NumVecs = 4; IsStore = true; break;
15322 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
15323 NumVecs = 2; break;
15324 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
15325 NumVecs = 3; break;
15326 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
15327 NumVecs = 4; break;
15328 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
15329 NumVecs = 2; IsStore = true; break;
15330 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
15331 NumVecs = 3; IsStore = true; break;
15332 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
15333 NumVecs = 4; IsStore = true; break;
15334 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
15335 NumVecs = 2; IsDupOp = true; break;
15336 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
15337 NumVecs = 3; IsDupOp = true; break;
15338 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
15339 NumVecs = 4; IsDupOp = true; break;
15340 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
15341 NumVecs = 2; IsLaneOp = true; break;
15342 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
15343 NumVecs = 3; IsLaneOp = true; break;
15344 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
15345 NumVecs = 4; IsLaneOp = true; break;
15346 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
15347 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
15348 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
15349 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
15350 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
15351 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
15352 }
15353
15354 EVT VecTy;
15355 if (IsStore)
15356 VecTy = N->getOperand(2).getValueType();
15357 else
15358 VecTy = N->getValueType(0);
15359
15360 // If the increment is a constant, it must match the memory ref size.
15361 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15363 uint32_t IncVal = CInc->getZExtValue();
15364 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15365 if (IsLaneOp || IsDupOp)
15366 NumBytes /= VecTy.getVectorNumElements();
15367 if (IncVal != NumBytes)
15368 continue;
15369 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
15370 }
15372 Ops.push_back(N->getOperand(0)); // Incoming chain
15374 // Load-lane and store intrinsics have a vector list as input.
15374 if (IsLaneOp || IsStore)
15375 for (unsigned i = 2; i < AddrOpIdx; ++i)
15376 Ops.push_back(N->getOperand(i));
15377 Ops.push_back(Addr); // Base register
15378 Ops.push_back(Inc);
15379
15380 // Return Types.
15381 EVT Tys[6];
15382 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
15383 unsigned n;
15384 for (n = 0; n < NumResultVecs; ++n)
15385 Tys[n] = VecTy;
15386 Tys[n++] = MVT::i64; // Type of write back register
15387 Tys[n] = MVT::Other; // Type of the chain
15389
15392 MemInt->getMemoryVT(),
15393 MemInt->getMemOperand());
15394
15395 // Update the uses.
15396 std::vector<SDValue> NewResults;
15397 for (unsigned i = 0; i < NumResultVecs; ++i) {
15398 NewResults.push_back(SDValue(UpdN.getNode(), i));
15399 }
15400 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
15401 DCI.CombineTo(N, NewResults);
15402 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
15403
15404 break;
15405 }
15406 return SDValue();
15407}
15408
15409// Checks to see if the value is the prescribed width and returns information
15410// about its extension mode.
15411static
15412bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
15413 ExtType = ISD::NON_EXTLOAD;
15414 switch(V.getNode()->getOpcode()) {
15415 default:
15416 return false;
15417 case ISD::LOAD: {
15418 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
15419 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
15420 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
15421 ExtType = LoadNode->getExtensionType();
15422 return true;
15423 }
15424 return false;
15425 }
15426 case ISD::AssertSext: {
15427 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
15428 if ((TypeNode->getVT() == MVT::i8 && width == 8)
15429 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
15430 ExtType = ISD::SEXTLOAD;
15431 return true;
15432 }
15433 return false;
15434 }
15435 case ISD::AssertZext: {
15436 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
15437 if ((TypeNode->getVT() == MVT::i8 && width == 8)
15438 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
15439 ExtType = ISD::ZEXTLOAD;
15440 return true;
15441 }
15442 return false;
15443 }
15444 case ISD::Constant:
15445 case ISD::TargetConstant: {
15446 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
15447 1LL << (width - 1);
15448 }
15449 }
15450
15451 return true;
15452}
15453
15454// This function does a whole lot of voodoo to determine if the tests are
15455// equivalent without and with a mask. Essentially what happens is that given a
15456// DAG resembling:
15457//
15458// +-------------+ +-------------+ +-------------+ +-------------+
15459// | Input | | AddConstant | | CompConstant| | CC |
15460// +-------------+ +-------------+ +-------------+ +-------------+
15461// | | | |
15462// V V | +----------+
15463// +-------------+ +----+ | |
15464// | ADD | |0xff| | |
15465// +-------------+ +----+ | |
15466// | | | |
15467// V V | |
15468// +-------------+ | |
15469// | AND | | |
15470// +-------------+ | |
15471// | | |
15472// +-----+ | |
15473// | | |
15474// V V V
15475// +-------------+
15476// | CMP |
15477// +-------------+
15478//
15479// The AND node may be safely removed for some combinations of inputs. In
15480// particular we need to take into account the extension type of the Input,
15481// the exact values of AddConstant, CompConstant, and CC, along with the nominal
15482// width of the input (this can work for inputs of any width; the above graph is
15483// specific to 8 bits).
15484//
15485// The specific equations were worked out by generating output tables for each
15486// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
15487// problem was simplified by working with 4 bit inputs, which means we only
15488// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
15489// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
15490// patterns present in both extensions (0,7). For every distinct set of
15491// AddConstant and CompConstant bit patterns we can consider the masked and
15492// unmasked versions to be equivalent if the result of this function is true for
15493// all 16 distinct bit patterns of the current extension type of Input (w0).
15494//
15495// sub w8, w0, w1
15496// and w10, w8, #0x0f
15497// cmp w8, w2
15498// cset w9, AArch64CC
15499// cmp w10, w2
15500// cset w11, AArch64CC
15501// cmp w9, w11
15502// cset w0, eq
15503// ret
15504//
15505// Since the above function shows when the outputs are equivalent it defines
15506// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
15507// would be expensive to run during compiles. The equations below were written
15508// in a test harness that confirmed they gave outputs equivalent to the above
15509// function for all inputs, so they can be used to determine whether the removal
15510// is legal instead.
15511//
15512// isEquivalentMaskless() is the code for testing if the AND can be removed,
15513// factored out of the DAG recognition since the DAG can take several forms.
15514
15515static bool isEquivalentMaskless(unsigned CC, unsigned width,
15516 ISD::LoadExtType ExtType, int AddConstant,
15517 int CompConstant) {
15518 // By being careful about our equations and only writing them in terms of
15519 // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
15520 // make them generally applicable to all bit widths.
15521 int MaxUInt = (1 << width);
15522
15523 // For the purposes of these comparisons sign extending the type is
15524 // equivalent to zero extending the add and displacing it by half the integer
15525 // width. Provided we are careful and make sure our equations are valid over
15526 // the whole range we can just adjust the input and avoid writing equations
15527 // for sign extended inputs.
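 // For example, with width == 8 a sign-extended input is handled by subtracting
 // 128 (i.e. 1 << 7) from AddConstant before applying the equations below.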
15528 if (ExtType == ISD::SEXTLOAD)
15529 AddConstant -= (1 << (width-1));
15530
15531 switch(CC) {
15532 case AArch64CC::LE:
15533 case AArch64CC::GT:
15534 if ((AddConstant == 0) ||
15535 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
15536 (AddConstant >= 0 && CompConstant < 0) ||
15538 return true;
15539 break;
15540 case AArch64CC::LT:
15541 case AArch64CC::GE:
15542 if ((AddConstant == 0) ||
15543 (AddConstant >= 0 && CompConstant <= 0) ||
15544 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
15545 return true;
15546 break;
15547 case AArch64CC::HI:
15548 case AArch64CC::LS:
15549 if ((AddConstant >= 0 && CompConstant < 0) ||
15552 return true;
15553 break;
15554 case AArch64CC::PL:
15555 case AArch64CC::MI:
15556 if ((AddConstant == 0) ||
15557 (AddConstant > 0 && CompConstant <= 0) ||
15559 return true;
15560 break;
15561 case AArch64CC::LO:
15562 case AArch64CC::HS:
15563 if ((AddConstant >= 0 && CompConstant <= 0) ||
15566 return true;
15567 break;
15568 case AArch64CC::EQ:
15569 case AArch64CC::NE:
15570 if ((AddConstant > 0 && CompConstant < 0) ||
15573 (AddConstant >= 0 && CompConstant >= 0 &&
15576 return true;
15577 break;
15578 case AArch64CC::VS:
15579 case AArch64CC::VC:
15580 case AArch64CC::AL:
15581 case AArch64CC::NV:
15582 return true;
15583 case AArch64CC::Invalid:
15584 break;
15585 }
15586
15587 return false;
15588}
15589
15590static
15593 SelectionDAG &DAG, unsigned CCIndex,
15594 unsigned CmpIndex) {
15595 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
15596 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
15597 unsigned CondOpcode = SubsNode->getOpcode();
15598
15600 return SDValue();
15601
15602 // There is a SUBS feeding this condition. Is it fed by a mask we can
15603 // use?
15604
15605 SDNode *AndNode = SubsNode->getOperand(0).getNode();
15606 unsigned MaskBits = 0;
15607
15608 if (AndNode->getOpcode() != ISD::AND)
15609 return SDValue();
15610
15611 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
15612 uint32_t CNV = CN->getZExtValue();
15613 if (CNV == 255)
15614 MaskBits = 8;
15615 else if (CNV == 65535)
15616 MaskBits = 16;
15617 }
15618
15619 if (!MaskBits)
15620 return SDValue();
15621
15622 SDValue AddValue = AndNode->getOperand(0);
15623
15624 if (AddValue.getOpcode() != ISD::ADD)
15625 return SDValue();
15626
15627 // The basic DAG structure is correct; grab the inputs and validate them.
15628
15629 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
15630 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
15631 SDValue SubsInputValue = SubsNode->getOperand(1);
15632
15633 // The mask is present and the provenance of all the values is a smaller type,
15634 // let's see if the mask is superfluous.
15635
15636 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
15638 return SDValue();
15639
15640 ISD::LoadExtType ExtType;
15641
15642 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
15645 return SDValue();
15646
15647 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
15648 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
15649 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
15650 return SDValue();
15651
15652 // The AND is not necessary, remove it.
15653
15654 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
15655 SubsNode->getValueType(1));
15656 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
15657
15658 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
15659 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
15660
15661 return SDValue(N, 0);
15662}
15663
15664// Optimize compare with zero and branch.
15667 SelectionDAG &DAG) {
15669 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
15670 // will not be produced, as they are conditional branch instructions that do
15671 // not set flags.
15672 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
15673 return SDValue();
15674
15675 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
15676 N = NV.getNode();
15677 SDValue Chain = N->getOperand(0);
15678 SDValue Dest = N->getOperand(1);
15679 SDValue CCVal = N->getOperand(2);
15680 SDValue Cmp = N->getOperand(3);
15681
15682 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
15683 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
15684 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
15685 return SDValue();
15686
15687 unsigned CmpOpc = Cmp.getOpcode();
15689 return SDValue();
15690
15691 // Only attempt folding if there is only one use of the flag and no use of the
15692 // value.
15693 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
15694 return SDValue();
15695
15696 SDValue LHS = Cmp.getOperand(0);
15697 SDValue RHS = Cmp.getOperand(1);
15698
15699 assert(LHS.getValueType() == RHS.getValueType() &&
15700 "Expected the value type to be the same for both operands!");
15701 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
15702 return SDValue();
15703
15704 if (isNullConstant(LHS))
15705 std::swap(LHS, RHS);
15706
15707 if (!isNullConstant(RHS))
15708 return SDValue();
15709
15710 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
15711 LHS.getOpcode() == ISD::SRL)
15712 return SDValue();
15713
15714 // Fold the compare into the branch instruction.
15715 SDValue BR;
15716 if (CC == AArch64CC::EQ)
15717 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
15718 else
15719 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
15720
15721 // Do not add new nodes to DAG combiner worklist.
15722 DCI.CombineTo(N, BR, false);
15723
15724 return SDValue();
15725}
15726
15727// Optimize CSEL instructions
15730 SelectionDAG &DAG) {
15731 // CSEL x, x, cc -> x
15732 if (N->getOperand(0) == N->getOperand(1))
15733 return N->getOperand(0);
15734
15735 return performCONDCombine(N, DCI, DAG, 2, 3);
15736}
15737
15739 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
15740 SDValue LHS = N->getOperand(0);
15741 SDValue RHS = N->getOperand(1);
15742 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
15743
15744 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
15745 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
15746 LHS->getOpcode() == AArch64ISD::CSEL &&
15747 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
15748 LHS->hasOneUse()) {
15749 SDLoc DL(N);
15750
15751 // Invert CSEL's condition.
15752 auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
15753 auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
15754 auto NewCond = getInvertedCondCode(OldCond);
15755
15756 // csel 0, 1, !cond, X
15757 SDValue CSEL =
15758 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
15759 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
15760 LHS.getOperand(3));
15761 return DAG.getZExtOrTrunc(CSEL, DL, N->getValueType(0));
15762 }
15763
15764 return SDValue();
15765}
15766
15768 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
15769 "Unexpected opcode!");
15770
15771 SDValue Pred = N->getOperand(0);
15772 SDValue LHS = N->getOperand(1);
15773 SDValue RHS = N->getOperand(2);
15774 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
15775
15776 // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne
15777 // => inner setcc_merge_zero
15778 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
15779 LHS->getOpcode() == ISD::SIGN_EXTEND &&
15780 LHS->getOperand(0)->getValueType(0) == N->getValueType(0) &&
15781 LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
15782 LHS->getOperand(0)->getOperand(0) == Pred)
15783 return LHS->getOperand(0);
15784
15785 return SDValue();
15786}
15787
15788// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
15789// as well as whether the test should be inverted. This code is required to
15790// catch these cases (as opposed to standard DAG combines) because
15791// AArch64ISD::TBZ is matched during legalization.
15792static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
15793 SelectionDAG &DAG) {
15794
15795 if (!Op->hasOneUse())
15796 return Op;
15797
15798 // We don't handle undef/constant-fold cases below, as they should have
15799 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
15800 // etc.)
15801
15802 // (tbz (trunc x), b) -> (tbz x, b)
15803 // This case is just here to enable more of the below cases to be caught.
15804 if (Op->getOpcode() == ISD::TRUNCATE &&
15805 Bit < Op->getValueType(0).getSizeInBits()) {
15806 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15807 }
15808
15809 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
15810 if (Op->getOpcode() == ISD::ANY_EXTEND &&
15811 Bit < Op->getOperand(0).getValueSizeInBits()) {
15812 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15813 }
15814
15815 if (Op->getNumOperands() != 2)
15816 return Op;
15817
15818 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
15819 if (!C)
15820 return Op;
15821
15822 switch (Op->getOpcode()) {
15823 default:
15824 return Op;
15825
15826 // (tbz (and x, m), b) -> (tbz x, b)
15827 case ISD::AND:
15828 if ((C->getZExtValue() >> Bit) & 1)
15829 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15830 return Op;
15831
15832 // (tbz (shl x, c), b) -> (tbz x, b-c)
15833 case ISD::SHL:
15834 if (C->getZExtValue() <= Bit &&
15835 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
15836 Bit = Bit - C->getZExtValue();
15837 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15838 }
15839 return Op;
15840
15841 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
15842 case ISD::SRA:
15843 Bit = Bit + C->getZExtValue();
15844 if (Bit >= Op->getValueType(0).getSizeInBits())
15845 Bit = Op->getValueType(0).getSizeInBits() - 1;
15846 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15847
15848 // (tbz (srl x, c), b) -> (tbz x, b+c)
15849 case ISD::SRL:
15850 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
15851 Bit = Bit + C->getZExtValue();
15852 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15853 }
15854 return Op;
15855
15856 // (tbz (xor x, -1), b) -> (tbnz x, b)
15857 case ISD::XOR:
15858 if ((C->getZExtValue() >> Bit) & 1)
15859 Invert = !Invert;
15860 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15861 }
15862}
15863
15864// Optimize test single bit zero/non-zero and branch.
15867 SelectionDAG &DAG) {
15868 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
15869 bool Invert = false;
15870 SDValue TestSrc = N->getOperand(1);
15872
15873 if (TestSrc == NewTestSrc)
15874 return SDValue();
15875
15876 unsigned NewOpc = N->getOpcode();
15877 if (Invert) {
15878 if (NewOpc == AArch64ISD::TBZ)
15880 else {
15883 }
15884 }
15885
15886 SDLoc DL(N);
15887 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
15888 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
15889}
15890
15891// vselect (v1i1 setcc) ->
15892// vselect (v1iXX setcc) (XX is the size of the compared operand type)
15893// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
15894// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
15895// such VSELECT.
15897 SDValue N0 = N->getOperand(0);
15898 EVT CCVT = N0.getValueType();
15899
15900 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
15901// into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
15902 // supported types.
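// For example, with 32-bit lanes:
//   (vselect (setgt X, splat(-1)), splat(1), splat(-1)) -> (or (sra X, 31), splat(1))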
15903 SDValue SetCC = N->getOperand(0);
15904 if (SetCC.getOpcode() == ISD::SETCC &&
15905 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
15906 SDValue CmpLHS = SetCC.getOperand(0);
15907 EVT VT = CmpLHS.getValueType();
15908 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
15909 SDNode *SplatLHS = N->getOperand(1).getNode();
15910 SDNode *SplatRHS = N->getOperand(2).getNode();
15912 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
15913 VT.isSimple() &&
15917 VT.getSimpleVT().SimpleTy) &&
15919 SplatLHSVal.isOneValue() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
15921 unsigned NumElts = VT.getVectorNumElements();
15924 VT.getScalarType()));
15925 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
15926
15927 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
15928 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
15929 return Or;
15930 }
15931 }
15932
15933 if (N0.getOpcode() != ISD::SETCC ||
15934 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
15935 CCVT.getVectorElementType() != MVT::i1)
15936 return SDValue();
15937
15938 EVT ResVT = N->getValueType(0);
15939 EVT CmpVT = N0.getOperand(0).getValueType();
15940 // Only combine when the result type is of the same size as the compared
15941 // operands.
15942 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
15943 return SDValue();
15944
15945 SDValue IfTrue = N->getOperand(1);
15946 SDValue IfFalse = N->getOperand(2);
15947 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
15948 N0.getOperand(0), N0.getOperand(1),
15949 cast<CondCodeSDNode>(N0.getOperand(2))->get());
15950 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
15951 IfTrue, IfFalse);
15952}
15953
15954/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
15955/// the compare-mask instructions rather than going via NZCV, even if LHS and
15956/// RHS are really scalar. This replaces any scalar setcc in the above pattern
15957/// with a vector one followed by a DUP shuffle on the result.
15960 SelectionDAG &DAG = DCI.DAG;
15961 SDValue N0 = N->getOperand(0);
15962 EVT ResVT = N->getValueType(0);
15963
15964 if (N0.getOpcode() != ISD::SETCC)
15965 return SDValue();
15966
15967 if (ResVT.isScalableVector())
15968 return SDValue();
15969
15970 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
15971 // scalar SetCCResultType. We also don't expect vectors, because we assume
15972 // that selects fed by vector SETCCs are canonicalized to VSELECT.
15973 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
15974 "Scalar-SETCC feeding SELECT has unexpected result type!");
15975
15976 // If NumMaskElts == 0, the comparison is larger than the select result. The
15977 // largest real NEON comparison is 64 bits per lane, which means the result is
15978 // at most 32 bits and an illegal vector. Just bail out for now.
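 // For example, an f64 compare feeding a 32-bit select result gives
 // NumMaskElts == 32 / 64 == 0, so there is no sensible mask vector to build.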
15979 EVT SrcVT = N0.getOperand(0).getValueType();
15980
15981 // Don't try to do this optimization when the setcc itself has i1 operands.
15982 // There are no legal vectors of i1, so this would be pointless.
15983 if (SrcVT == MVT::i1)
15984 return SDValue();
15985
15986 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
15987 if (!ResVT.isVector() || NumMaskElts == 0)
15988 return SDValue();
15989
15991 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
15992
15993 // Also bail out if the vector CCVT isn't the same size as ResVT.
15994 // This can happen if the SETCC operand size doesn't divide the ResVT size
15995 // (e.g., f64 vs v3f32).
15996 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
15997 return SDValue();
15998
15999 // Make sure we didn't create illegal types, if we're not supposed to.
16000 assert(DCI.isBeforeLegalize() ||
16002
16003 // First perform a vector comparison, where lane 0 is the one we're interested
16004 // in.
16005 SDLoc DL(N0);
16006 SDValue LHS =
16008 SDValue RHS =
16010 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
16011
16012 // Now duplicate the comparison mask we want across all other lanes.
16013 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
16015 Mask = DAG.getNode(ISD::BITCAST, DL,
16016 ResVT.changeVectorElementTypeToInteger(), Mask);
16017
16018 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
16019}
16020
16021/// Get rid of unnecessary NVCASTs (that don't change the type).
16023 if (N->getValueType(0) == N->getOperand(0).getValueType())
16024 return N->getOperand(0);
16025
16026 return SDValue();
16027}
16028
16029// If all users of the globaladdr are of the form (globaladdr + constant), find
16030// the smallest constant, fold it into the globaladdr's offset and rewrite the
16031// globaladdr as (globaladdr + constant) - constant.
16033 const AArch64Subtarget *Subtarget,
16034 const TargetMachine &TM) {
16036 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
16038 return SDValue();
16039
16040 uint64_t MinOffset = -1ull;
16041 for (SDNode *N : GN->uses()) {
16042 if (N->getOpcode() != ISD::ADD)
16043 return SDValue();
16044 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
16045 if (!C)
16046 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
16047 if (!C)
16048 return SDValue();
16049 MinOffset = std::min(MinOffset, C->getZExtValue());
16050 }
16051 uint64_t Offset = MinOffset + GN->getOffset();
16052
16053 // Require that the new offset is larger than the existing one. Otherwise, we
16054 // can end up oscillating between two possible DAGs, for example,
16055 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
16056 if (Offset <= uint64_t(GN->getOffset()))
16057 return SDValue();
16058
16059 // Check whether folding this offset is legal. It must not go out of bounds of
16060 // the referenced object to avoid violating the code model, and must be
16061 // smaller than 2^21 because this is the largest offset expressible in all
16062 // object formats.
16063 //
16064 // This check also prevents us from folding negative offsets, which will end
16065 // up being treated in the same way as large positive ones. They could also
16066 // cause code model violations, and aren't really common enough to matter.
16067 if (Offset >= (1 << 21))
16068 return SDValue();
16069
16070 const GlobalValue *GV = GN->getGlobal();
16071 Type *T = GV->getValueType();
16072 if (!T->isSized() ||
16074 return SDValue();
16075
16076 SDLoc DL(GN);
16077 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
16078 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
16079 DAG.getConstant(MinOffset, DL, MVT::i64));
16080}
16081
16082// Turns the vector of indices into a vector of byte offsets by scaling Offset
16083// by (BitWidth / 8).
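// For example, with BitWidth == 64 each index is shifted left by 3 (i.e.
// multiplied by 8) to turn element indices into byte offsets.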
16085 SDLoc DL, unsigned BitWidth) {
16086 assert(Offset.getValueType().isScalableVector() &&
16087 "This method is only for scalable vectors of offsets");
16088
16089 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
16091
16093}
16094
16095/// Check if the value of \p OffsetInBytes can be used as an immediate for
16096/// the gather load/prefetch and scatter store instructions with vector base and
16097/// immediate offset addressing mode:
16098///
16099/// [<Zn>.[S|D]{, #<imm>}]
16100///
16101/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
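///
/// For example, with 4-byte elements the valid immediates are 0, 4, 8, ..., 124.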
16103 unsigned ScalarSizeInBytes) {
16104 // The immediate is not a multiple of the scalar size.
16106 return false;
16107
16108 // The immediate is out of range.
16110 return false;
16111
16112 return true;
16113}
16114
16115/// Check if the value of \p Offset represents a valid immediate for the SVE
16116/// gather load/prefetch and scatter store instructions with vector base and
16117/// immediate offset addressing mode:
16118///
16119/// [<Zn>.[S|D]{, #<imm>}]
16120///
16121/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
16128
16130 unsigned Opcode,
16131 bool OnlyPackedOffsets = true) {
16132 const SDValue Src = N->getOperand(2);
16133 const EVT SrcVT = Src->getValueType(0);
16134 assert(SrcVT.isScalableVector() &&
16135 "Scatter stores are only possible for SVE vectors");
16136
16137 SDLoc DL(N);
16138 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
16139
16140 // Make sure that source data will fit into an SVE register
16141 if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
16142 return SDValue();
16143
16144 // For FPs, ACLE only supports _packed_ single and double precision types.
16145 if (SrcElVT.isFloatingPoint())
16146 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
16147 return SDValue();
16148
16149 // Depending on the addressing mode, this is either a pointer or a vector of
16150 // pointers (that fits into one register)
16151 SDValue Base = N->getOperand(4);
16152 // Depending on the addressing mode, this is either a single offset or a
16153 // vector of offsets (that fits into one register)
16154 SDValue Offset = N->getOperand(5);
16155
16156 // For "scalar + vector of indices", just scale the indices. This only
16157 // applies to non-temporal scatters because there's no instruction that takes
16158 // indices.
16159 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
16160 Offset =
16161 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
16162 Opcode = AArch64ISD::SSTNT1_PRED;
16163 }
16164
16165 // In the case of non-temporal scatter stores there's only one SVE instruction
16166 // per data-size: "scalar + vector", i.e.
16167 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
16168 // Since we do have intrinsics that allow the arguments to be in a different
16169 // order, we may need to swap them to match the spec.
16170 if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
16172
16173 // SST1_IMM requires that the offset is an immediate that is:
16174 // * a multiple of #SizeInBytes,
16175 // * in the range [0, 31 x #SizeInBytes],
16176 // where #SizeInBytes is the size in bytes of the stored items. For
16177 // immediates outside that range and non-immediate scalar offsets use SST1 or
16178 // SST1_UXTW instead.
16179 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
16181 SrcVT.getScalarSizeInBits() / 8)) {
16182 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
16184 else
16185 Opcode = AArch64ISD::SST1_PRED;
16186
16188 }
16189 }
16190
16191 auto &TLI = DAG.getTargetLoweringInfo();
16192 if (!TLI.isTypeLegal(Base.getValueType()))
16193 return SDValue();
16194
16195 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
16196 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
16197 // nxv2i64. Legalize accordingly.
16198 if (!OnlyPackedOffsets &&
16199 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
16201
16202 if (!TLI.isTypeLegal(Offset.getValueType()))
16203 return SDValue();
16204
16205 // Source value type that is representable in hardware
16207
16208 // Keep the original type of the input data to store - this is needed to be
16209 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
16210 // FP values we want the integer equivalent, so just use HwSrcVt.
16212 if (SrcVT.isFloatingPoint())
16214
16215 SDVTList VTs = DAG.getVTList(MVT::Other);
16217
16218 if (Src.getValueType().isFloatingPoint())
16219 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
16220 else
16221 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
16222
16223 SDValue Ops[] = {N->getOperand(0), // Chain
16224 SrcNew,
16225 N->getOperand(3), // Pg
16226 Base,
16227 Offset,
16228 InputVT};
16229
16230 return DAG.getNode(Opcode, DL, VTs, Ops);
16231}
16232
16234 unsigned Opcode,
16235 bool OnlyPackedOffsets = true) {
16236 const EVT RetVT = N->getValueType(0);
16237 assert(RetVT.isScalableVector() &&
16238 "Gather loads are only possible for SVE vectors");
16239
16240 SDLoc DL(N);
16241
16242 // Make sure that the loaded data will fit into an SVE register
16243 if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
16244 return SDValue();
16245
16246 // Depending on the addressing mode, this is either a pointer or a vector of
16247 // pointers (that fits into one register)
16248 SDValue Base = N->getOperand(3);
16249 // Depending on the addressing mode, this is either a single offset or a
16250 // vector of offsets (that fits into one register)
16251 SDValue Offset = N->getOperand(4);
16252
16253 // For "scalar + vector of indices", just scale the indices. This only
16254 // applies to non-temporal gathers because there's no instruction that takes
16255 // indices.
16258 RetVT.getScalarSizeInBits());
16260 }
16261
16262 // In the case of non-temporal gather loads there's only one SVE instruction
16263 // per data-size: "scalar + vector", i.e.
16264 // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
16265 // Since we do have intrinsics that allow the arguments to be in a different
16266 // order, we may need to swap them to match the spec.
16267 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
16268 Offset.getValueType().isVector())
16270
16271 // GLD{FF}1_IMM requires that the offset is an immediate that is:
16272 // * a multiple of #SizeInBytes,
16273 // * in the range [0, 31 x #SizeInBytes],
16274 // where #SizeInBytes is the size in bytes of the loaded items. For
16275 // immediates outside that range and non-immediate scalar offsets use
16276 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
16277 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
16280 RetVT.getScalarSizeInBits() / 8)) {
16281 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
16282 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
16285 else
16286 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
16289
16291 }
16292 }
16293
16294 auto &TLI = DAG.getTargetLoweringInfo();
16295 if (!TLI.isTypeLegal(Base.getValueType()))
16296 return SDValue();
16297
16298 // Some gather load variants allow unpacked offsets, but only as nxv2i32
16299 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
16300 // nxv2i64. Legalize accordingly.
16301 if (!OnlyPackedOffsets &&
16302 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
16304
16305 // Return value type that is representable in hardware
16307
16308 // Keep the original output value type around - this is needed to be able to
16309 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
16310 // values we want the integer equivalent, so just use HwRetVT.
16312 if (RetVT.isFloatingPoint())
16313 OutVT = DAG.getValueType(HwRetVt);
16314
16316 SDValue Ops[] = {N->getOperand(0), // Chain
16317 N->getOperand(2), // Pg
16318 Base, Offset, OutVT};
16319
16320 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
16321 SDValue LoadChain = SDValue(Load.getNode(), 1);
16322
16323 if (RetVT.isInteger() && (RetVT != HwRetVt))
16324 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
16325
16326 // If the original return value was FP, bitcast accordingly. Doing it here
16327 // means that we can avoid adding TableGen patterns for FPs.
16328 if (RetVT.isFloatingPoint())
16329 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
16330
16331 return DAG.getMergeValues({Load, LoadChain}, DL);
16332}
16333
16334static SDValue
16336 SelectionDAG &DAG) {
16337 SDLoc DL(N);
16338 SDValue Src = N->getOperand(0);
16339 unsigned Opc = Src->getOpcode();
16340
16341 // Sign extend of an unsigned unpack -> signed unpack
16342 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
16343
16344 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
16346
16347 // Push the sign extend to the operand of the unpack
16348 // This is necessary where, for example, the operand of the unpack
16349 // is another unpack:
16350 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
16351 // ->
16352 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8))
16353 // ->
16354 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
16355 SDValue ExtOp = Src->getOperand(0);
16356 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
16357 EVT EltTy = VT.getVectorElementType();
16358 (void)EltTy;
16359
16360 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
16361 "Sign extending from an invalid type");
16362
16364
16365 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
16366 ExtOp, DAG.getValueType(ExtVT));
16367
16368 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
16369 }
16370
16371 if (DCI.isBeforeLegalizeOps())
16372 return SDValue();
16373
16375 return SDValue();
16376
16377 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
16378 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
16379 unsigned NewOpc;
16380 unsigned MemVTOpNum = 4;
16381 switch (Opc) {
16384 MemVTOpNum = 3;
16385 break;
16388 MemVTOpNum = 3;
16389 break;
16392 MemVTOpNum = 3;
16393 break;
16396 break;
16399 break;
16402 break;
16405 break;
16408 break;
16411 break;
16414 break;
16417 break;
16420 break;
16423 break;
16426 break;
16429 break;
16432 break;
16435 break;
16438 break;
16439 default:
16440 return SDValue();
16441 }
16442
16443 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
16445
16446 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
16447 return SDValue();
16448
16449 EVT DstVT = N->getValueType(0);
16450 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
16451
16453 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
16454 Ops.push_back(Src->getOperand(I));
16455
16456 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
16457 DCI.CombineTo(N, ExtLoad);
16458 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
16459
16460 // Return N so it doesn't get rechecked
16461 return SDValue(N, 0);
16462}
16463
16464/// Legalize the gather prefetch (scalar + vector addressing mode) when the
16465/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
16466/// != nxv2i32) do not need legalization.
16468 const unsigned OffsetPos = 4;
16469 SDValue Offset = N->getOperand(OffsetPos);
16470
16471 // Not an unpacked vector, bail out.
16472 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
16473 return SDValue();
16474
16475 // Extend the unpacked offset vector to 64-bit lanes.
16476 SDLoc DL(N);
16478 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
16479 // Replace the offset operand with the 64-bit one.
16480 Ops[OffsetPos] = Offset;
16481
16482 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
16483}
16484
16485/// Combines a node carrying the intrinsic
16486/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
16487/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
16488/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
16489/// SVE gather prefetch instruction with vector plus immediate addressing mode.
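///
/// For example, a half-word prefetch (2-byte elements) only accepts immediate
/// offsets 0, 2, ..., 62; any other scalar offset is rewritten to go through
/// the uxtw-index form instead.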
16491 unsigned ScalarSizeInBytes) {
16492 const unsigned ImmPos = 4, OffsetPos = 3;
16493 // No need to combine the node if the immediate is valid...
16495 return SDValue();
16496
16497 // ...otherwise swap the offset base with the offset...
16498 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
16499 std::swap(Ops[ImmPos], Ops[OffsetPos]);
16500 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
16501 // `aarch64_sve_prfb_gather_uxtw_index`.
16502 SDLoc DL(N);
16503 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
16504 MVT::i64);
16505
16506 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
16507}
16508
16509// Return true if the vector operation can guarantee that only the first lane of
16510// its result contains data, with all bits in other lanes set to zero.
16512 switch (Op.getOpcode()) {
16513 default:
16514 return false;
16530 return true;
16531 }
16532}
16533
16535 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
16536 SDValue InsertVec = N->getOperand(0);
16537 SDValue InsertElt = N->getOperand(1);
16538 SDValue InsertIdx = N->getOperand(2);
16539
16540 // We only care about inserts into the first element...
16541 if (!isNullConstant(InsertIdx))
16542 return SDValue();
16543 // ...of a zero'd vector...
16545 return SDValue();
16546 // ...where the inserted data was previously extracted...
16547 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16548 return SDValue();
16549
16550 SDValue ExtractVec = InsertElt.getOperand(0);
16551 SDValue ExtractIdx = InsertElt.getOperand(1);
16552
16553 // ...from the first element of a vector.
16555 return SDValue();
16556
16557 // If we get here we are effectively trying to zero lanes 1-N of a vector.
16558
16559 // Ensure there's no type conversion going on.
16560 if (N->getValueType(0) != ExtractVec.getValueType())
16561 return SDValue();
16562
16564 return SDValue();
16565
16566 // The explicit zeroing is redundant.
16567 return ExtractVec;
16568}
16569
16570static SDValue
16577
16579 EVT Ty = N->getValueType(0);
16580 if (Ty.isInteger())
16581 return SDValue();
16582
16584 EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
16585 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
16586 IntTy.getVectorElementType().getScalarSizeInBits())
16587 return SDValue();
16588
16589 SDLoc DL(N);
16590 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
16591 DL, ExtIntTy);
16592 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
16593 DL, ExtIntTy);
16594 SDValue Idx = N->getOperand(2);
16596 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
16597 return DAG.getBitcast(Ty, Trunc);
16598}
16599
16601 DAGCombinerInfo &DCI) const {
16602 SelectionDAG &DAG = DCI.DAG;
16603 switch (N->getOpcode()) {
16604 default:
16605 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
16606 break;
16607 case ISD::ADD:
16608 case ISD::SUB:
16609 return performAddSubCombine(N, DCI, DAG);
16610 case ISD::XOR:
16611 return performXorCombine(N, DAG, DCI, Subtarget);
16612 case ISD::MUL:
16613 return performMulCombine(N, DAG, DCI, Subtarget);
16614 case ISD::SINT_TO_FP:
16615 case ISD::UINT_TO_FP:
16616 return performIntToFpCombine(N, DAG, Subtarget);
16617 case ISD::FP_TO_SINT:
16618 case ISD::FP_TO_UINT:
16619 return performFpToIntCombine(N, DAG, DCI, Subtarget);
16620 case ISD::FDIV:
16621 return performFDivCombine(N, DAG, DCI, Subtarget);
16622 case ISD::OR:
16623 return performORCombine(N, DCI, Subtarget);
16624 case ISD::AND:
16625 return performANDCombine(N, DCI);
16626 case ISD::SRL:
16627 return performSRLCombine(N, DCI);
16629 return performIntrinsicCombine(N, DCI, Subtarget);
16630 case ISD::ANY_EXTEND:
16631 case ISD::ZERO_EXTEND:
16632 case ISD::SIGN_EXTEND:
16633 return performExtendCombine(N, DCI, DAG);
16635 return performSignExtendInRegCombine(N, DCI, DAG);
16636 case ISD::TRUNCATE:
16637 return performVectorTruncateCombine(N, DCI, DAG);
16639 return performConcatVectorsCombine(N, DCI, DAG);
16640 case ISD::SELECT:
16641 return performSelectCombine(N, DCI);
16642 case ISD::VSELECT:
16643 return performVSelectCombine(N, DCI.DAG);
16644 case ISD::SETCC:
16645 return performSETCCCombine(N, DAG);
16646 case ISD::LOAD:
16647 if (performTBISimplification(N->getOperand(1), DCI, DAG))
16648 return SDValue(N, 0);
16649 break;
16650 case ISD::STORE:
16651 return performSTORECombine(N, DCI, DAG, Subtarget);
16652 case ISD::VECTOR_SPLICE:
16653 return performSVESpliceCombine(N, DAG);
16654 case AArch64ISD::BRCOND:
16655 return performBRCONDCombine(N, DCI, DAG);
16656 case AArch64ISD::TBNZ:
16657 case AArch64ISD::TBZ:
16658 return performTBZCombine(N, DCI, DAG);
16659 case AArch64ISD::CSEL:
16660 return performCSELCombine(N, DCI, DAG);
16661 case AArch64ISD::DUP:
16662 return performPostLD1Combine(N, DCI, false);
16663 case AArch64ISD::NVCAST:
16664 return performNVCASTCombine(N);
16665 case AArch64ISD::SPLICE:
16666 return performSpliceCombine(N, DAG);
16667 case AArch64ISD::UZP1:
16668 return performUzpCombine(N, DAG);
16670 return performSetccMergeZeroCombine(N, DAG);
16685 return performGLD1Combine(N, DAG);
16686 case AArch64ISD::VASHR:
16687 case AArch64ISD::VLSHR:
16688 return performVectorShiftCombine(N, *this, DCI);
16692 return performExtractVectorEltCombine(N, DAG);
16693 case ISD::VECREDUCE_ADD:
16694 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
16697 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
16698 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
16699 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
16700 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
16701 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
16702 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
16703 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
16704 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
16705 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
16706 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
16707 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
16708 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
16709 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
16710 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
16711 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
16712 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
16713 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
16715 case Intrinsic::aarch64_neon_ld2:
16716 case Intrinsic::aarch64_neon_ld3:
16717 case Intrinsic::aarch64_neon_ld4:
16718 case Intrinsic::aarch64_neon_ld1x2:
16719 case Intrinsic::aarch64_neon_ld1x3:
16720 case Intrinsic::aarch64_neon_ld1x4:
16721 case Intrinsic::aarch64_neon_ld2lane:
16722 case Intrinsic::aarch64_neon_ld3lane:
16723 case Intrinsic::aarch64_neon_ld4lane:
16724 case Intrinsic::aarch64_neon_ld2r:
16725 case Intrinsic::aarch64_neon_ld3r:
16726 case Intrinsic::aarch64_neon_ld4r:
16727 case Intrinsic::aarch64_neon_st2:
16728 case Intrinsic::aarch64_neon_st3:
16729 case Intrinsic::aarch64_neon_st4:
16730 case Intrinsic::aarch64_neon_st1x2:
16731 case Intrinsic::aarch64_neon_st1x3:
16732 case Intrinsic::aarch64_neon_st1x4:
16733 case Intrinsic::aarch64_neon_st2lane:
16734 case Intrinsic::aarch64_neon_st3lane:
16735 case Intrinsic::aarch64_neon_st4lane:
16736 return performNEONPostLDSTCombine(N, DCI, DAG);
16737 case Intrinsic::aarch64_sve_ldnt1:
16738 return performLDNT1Combine(N, DAG);
16739 case Intrinsic::aarch64_sve_ld1rq:
16741 case Intrinsic::aarch64_sve_ld1ro:
16743 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
16745 case Intrinsic::aarch64_sve_ldnt1_gather:
16747 case Intrinsic::aarch64_sve_ldnt1_gather_index:
16748 return performGatherLoadCombine(N, DAG,
16750 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
16752 case Intrinsic::aarch64_sve_ld1:
16754 case Intrinsic::aarch64_sve_ldnf1:
16756 case Intrinsic::aarch64_sve_ldff1:
16758 case Intrinsic::aarch64_sve_st1:
16759 return performST1Combine(N, DAG);
16760 case Intrinsic::aarch64_sve_stnt1:
16761 return performSTNT1Combine(N, DAG);
16762 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
16764 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
16766 case Intrinsic::aarch64_sve_stnt1_scatter:
16768 case Intrinsic::aarch64_sve_stnt1_scatter_index:
16770 case Intrinsic::aarch64_sve_ld1_gather:
16772 case Intrinsic::aarch64_sve_ld1_gather_index:
16773 return performGatherLoadCombine(N, DAG,
16775 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
16777 /*OnlyPackedOffsets=*/false);
16778 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
16780 /*OnlyPackedOffsets=*/false);
16781 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
16782 return performGatherLoadCombine(N, DAG,
16784 /*OnlyPackedOffsets=*/false);
16785 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
16786 return performGatherLoadCombine(N, DAG,
16788 /*OnlyPackedOffsets=*/false);
16789 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
16791 case Intrinsic::aarch64_sve_ldff1_gather:
16793 case Intrinsic::aarch64_sve_ldff1_gather_index:
16794 return performGatherLoadCombine(N, DAG,
16796 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
16797 return performGatherLoadCombine(N, DAG,
16799 /*OnlyPackedOffsets=*/false);
16800 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
16801 return performGatherLoadCombine(N, DAG,
16803 /*OnlyPackedOffsets=*/false);
16804 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
16805 return performGatherLoadCombine(N, DAG,
16807 /*OnlyPackedOffsets=*/false);
16808 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
16809 return performGatherLoadCombine(N, DAG,
16811 /*OnlyPackedOffsets=*/false);
16812 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
16813 return performGatherLoadCombine(N, DAG,
16815 case Intrinsic::aarch64_sve_st1_scatter:
16817 case Intrinsic::aarch64_sve_st1_scatter_index:
16819 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
16821 /*OnlyPackedOffsets=*/false);
16822 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
16824 /*OnlyPackedOffsets=*/false);
16825 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
16826 return performScatterStoreCombine(N, DAG,
16828 /*OnlyPackedOffsets=*/false);
16829 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
16830 return performScatterStoreCombine(N, DAG,
16832 /*OnlyPackedOffsets=*/false);
16833 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
16835 case Intrinsic::aarch64_sve_tuple_get: {
16836 SDLoc DL(N);
16837 SDValue Chain = N->getOperand(0);
16838 SDValue Src1 = N->getOperand(2);
16839 SDValue Idx = N->getOperand(3);
16840
16841 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
16842 EVT ResVT = N->getValueType(0);
16843 uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue();
16845 SDValue Val =
16847 return DAG.getMergeValues({Val, Chain}, DL);
16848 }
16849 case Intrinsic::aarch64_sve_tuple_set: {
16850 SDLoc DL(N);
16851 SDValue Chain = N->getOperand(0);
16852 SDValue Tuple = N->getOperand(2);
16853 SDValue Idx = N->getOperand(3);
16854 SDValue Vec = N->getOperand(4);
16855
16856 EVT TupleVT = Tuple.getValueType();
16857 uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue();
16858
16859 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
16860 uint64_t NumLanes =
16861 Vec.getValueType().getVectorElementCount().getKnownMinValue();
16862
16863 if ((TupleLanes % NumLanes) != 0)
16864 report_fatal_error("invalid tuple vector!");
16865
16866 uint64_t NumVecs = TupleLanes / NumLanes;
16867
16868 SmallVector<SDValue, 4> Opnds;
16869 for (unsigned I = 0; I < NumVecs; ++I) {
16870 if (I == IdxConst)
16871 Opnds.push_back(Vec);
16872 else {
16873 SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL);
16874 Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
16875 Vec.getValueType(), Tuple, ExtIdx));
16876 }
16877 }
16878 SDValue Concat =
16879 DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
16880 return DAG.getMergeValues({Concat, Chain}, DL);
16881 }
16882 case Intrinsic::aarch64_sve_tuple_create2:
16883 case Intrinsic::aarch64_sve_tuple_create3:
16884 case Intrinsic::aarch64_sve_tuple_create4: {
16885 SDLoc DL(N);
16886 SDValue Chain = N->getOperand(0);
16887
16888 SmallVector<SDValue, 4> Opnds;
16889 for (unsigned I = 2; I < N->getNumOperands(); ++I)
16890 Opnds.push_back(N->getOperand(I));
16891
16892 EVT VT = Opnds[0].getValueType();
16896 (N->getNumOperands() - 2));
16898 return DAG.getMergeValues({Concat, Chain}, DL);
16899 }
16900 case Intrinsic::aarch64_sve_ld2:
16901 case Intrinsic::aarch64_sve_ld3:
16902 case Intrinsic::aarch64_sve_ld4: {
16903 SDLoc DL(N);
16904 SDValue Chain = N->getOperand(0);
16905 SDValue Mask = N->getOperand(2);
16906 SDValue BasePtr = N->getOperand(3);
16907 SDValue LoadOps[] = {Chain, Mask, BasePtr};
16908 unsigned IntrinsicID =
16909 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
16910 SDValue Result =
16911 LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
16912 return DAG.getMergeValues({Result, Chain}, DL);
16913 }
16914 case Intrinsic::aarch64_rndr:
16915 case Intrinsic::aarch64_rndrrs: {
16916 unsigned IntrinsicID =
16917 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
16918 auto Register =
16919 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
16920 : AArch64SysReg::RNDRRS);
16921 SDLoc DL(N);
16922 SDValue A = DAG.getNode(
16924 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
16925 SDValue B = DAG.getNode(
16927 DAG.getConstant(0, DL, MVT::i32),
16928 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
16929 return DAG.getMergeValues(
16930 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
16931 }
16932 default:
16933 break;
16934 }
16935 break;
16936 case ISD::GlobalAddress:
16937 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
16938 }
16939 return SDValue();
16940}
16941
16942// Check if the return value is used as only a return value, as otherwise
16943// we can't perform a tail-call. In particular, we need to check for
16944// target ISD nodes that are returns and any other "odd" constructs
16945// that the generic analysis code won't necessarily catch.
16946bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
16947 SDValue &Chain) const {
16948 if (N->getNumValues() != 1)
16949 return false;
16950 if (!N->hasNUsesOfValue(1, 0))
16951 return false;
16952
16953 SDValue TCChain = Chain;
16954 SDNode *Copy = *N->use_begin();
16955 if (Copy->getOpcode() == ISD::CopyToReg) {
16956 // If the copy has a glue operand, we conservatively assume it isn't safe to
16957 // perform a tail call.
16958 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
16959 MVT::Glue)
16960 return false;
16961 TCChain = Copy->getOperand(0);
16962 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
16963 return false;
16964
16965 bool HasRet = false;
16966 for (SDNode *Node : Copy->uses()) {
16967 if (Node->getOpcode() != AArch64ISD::RET_FLAG)
16968 return false;
16969 HasRet = true;
16970 }
16971
16972 if (!HasRet)
16973 return false;
16974
16975 Chain = TCChain;
16976 return true;
16977}
16978
16979 // Return whether an instruction can potentially be optimized to a tail
16980// call. This will cause the optimizers to attempt to move, or duplicate,
16981// return instructions to help enable tail call optimizations for this
16982// instruction.
16983bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
16984 return CI->isTailCall();
16985}
16986
16987bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
16988 SDValue &Offset,
16989 ISD::MemIndexedMode &AM,
16990 bool &IsInc,
16991 SelectionDAG &DAG) const {
16992 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
16993 return false;
16994
16995 Base = Op->getOperand(0);
16996 // All of the indexed addressing mode instructions take a signed
16997 // 9 bit immediate offset.
16998 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
16999 int64_t RHSC = RHS->getSExtValue();
17000 if (Op->getOpcode() == ISD::SUB)
17001 RHSC = -(uint64_t)RHSC;
17002 if (!isInt<9>(RHSC))
17003 return false;
17004 IsInc = (Op->getOpcode() == ISD::ADD);
17005 Offset = Op->getOperand(1);
17006 return true;
17007 }
17008 return false;
17009}
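// For illustration, the signed 9-bit window checked above is what the
// pre/post-indexed load/store forms accept; a rough sketch of the two shapes
// (not taken from generated output):
//   ldr x0, [x1, #16]!   // pre-indexed:  x1 += 16, then load from x1
//   ldr x0, [x1], #16    // post-indexed: load from x1, then x1 += 16
// Offsets outside [-256, 255] cannot use these forms and keep a separate
// add/sub of the base register instead.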
17010
17011bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
17012 SDValue &Offset,
17013 ISD::MemIndexedMode &AM,
17014 SelectionDAG &DAG) const {
17015 EVT VT;
17016 SDValue Ptr;
17017 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
17018 VT = LD->getMemoryVT();
17019 Ptr = LD->getBasePtr();
17020 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
17021 VT = ST->getMemoryVT();
17022 Ptr = ST->getBasePtr();
17023 } else
17024 return false;
17025
17026 bool IsInc;
17027 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
17028 return false;
17029 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
17030 return true;
17031}
17032
17033bool AArch64TargetLowering::getPostIndexedAddressParts(
17035 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
17036 EVT VT;
17037 SDValue Ptr;
17038 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
17039 VT = LD->getMemoryVT();
17040 Ptr = LD->getBasePtr();
17041 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
17042 VT = ST->getMemoryVT();
17043 Ptr = ST->getBasePtr();
17044 } else
17045 return false;
17046
17047 bool IsInc;
17048 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
17049 return false;
17050 // Post-indexing updates the base, so it's not a valid transform
17051 // if that's not the same as the load's pointer.
17052 if (Ptr != Base)
17053 return false;
17054 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
17055 return true;
17056}
17057
17058void AArch64TargetLowering::ReplaceBITCASTResults(
17059 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
17060 SDLoc DL(N);
17061 SDValue Op = N->getOperand(0);
17062 EVT VT = N->getValueType(0);
17063 EVT SrcVT = Op.getValueType();
17064
17065 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
17066 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
17067 "Expected fp->int bitcast!");
17068 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
17069 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
17070 return;
17071 }
17072
17073 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
17074 return;
17075
17076 Op = SDValue(
17077 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
17078 DAG.getUNDEF(MVT::i32), Op,
17079 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
17080 0);
17081 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
17082 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
17083}
17084
17087 SelectionDAG &DAG, unsigned InterOp,
17088 unsigned AcrossOp) {
17089 EVT LoVT, HiVT;
17090 SDValue Lo, Hi;
17091 SDLoc dl(N);
17092 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
17093 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
17094 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
17095 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
17096 Results.push_back(SplitVal);
17097}
17098
17099static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
17100 SDLoc DL(N);
17101 SDValue Lo = DAG.getAnyExtOrTrunc(N, DL, MVT::i64);
17102 SDValue Hi = DAG.getAnyExtOrTrunc(
17103 DAG.getNode(ISD::SRL, DL, MVT::i128, N,
17104 DAG.getConstant(64, DL, MVT::i64)));
17105 return std::make_pair(Lo, Hi);
17106}
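// For illustration, splitInt128 mirrors the scalar computation below; a
// minimal sketch assuming a 128-bit integer value V:
//   uint64_t Lo = (uint64_t)V;         // any-extend/truncate to i64
//   uint64_t Hi = (uint64_t)(V >> 64); // SRL by 64, then truncate to i64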
17107
17108void AArch64TargetLowering::ReplaceExtractSubVectorResults(
17109 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
17110 SDValue In = N->getOperand(0);
17111 EVT InVT = In.getValueType();
17112
17113 // Common code will handle these just fine.
17114 if (!InVT.isScalableVector() || !InVT.isInteger())
17115 return;
17116
17117 SDLoc DL(N);
17118 EVT VT = N->getValueType(0);
17119
17120 // The following checks bail if this is not a halving operation.
17121
17122 ElementCount ResEC = VT.getVectorElementCount();
17123
17124 if (InVT.getVectorElementCount() != (ResEC * 2))
17125 return;
17126
17127 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
17128 if (!CIndex)
17129 return;
17130
17131 unsigned Index = CIndex->getZExtValue();
17132 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
17133 return;
17134
17135 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
17136 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
17137
17138 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
17139 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
17140}
17141
17142// Create an even/odd pair of X registers holding integer value V.
17143 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
17144 SDLoc dl(V.getNode());
17145 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
17146 SDValue VHi = DAG.getAnyExtOrTrunc(
17147 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
17148 dl, MVT::i64);
17149 if (DAG.getDataLayout().isBigEndian())
17150 std::swap(VLo, VHi);
17151 SDValue RegClass =
17152 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
17153 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
17154 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
17155 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
17156 return SDValue(
17157 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
17158}
17159
17162 SelectionDAG &DAG,
17163 const AArch64Subtarget *Subtarget) {
17164 assert(N->getValueType(0) == MVT::i128 &&
17165 "AtomicCmpSwap on types less than 128 should be legal");
17166
17167 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
17168 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
17169 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
17170 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
17171 SDValue Ops[] = {
17172 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
17173 createGPRPairNode(DAG, N->getOperand(3)), // Store value
17174 N->getOperand(1), // Ptr
17175 N->getOperand(0), // Chain in
17176 };
17177
17178 unsigned Opcode;
17179 switch (MemOp->getMergedOrdering()) {
17181 Opcode = AArch64::CASPX;
17182 break;
17184 Opcode = AArch64::CASPAX;
17185 break;
17187 Opcode = AArch64::CASPLX;
17188 break;
17191 Opcode = AArch64::CASPALX;
17192 break;
17193 default:
17194 llvm_unreachable("Unexpected ordering!");
17195 }
17196
17198 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
17200
17201 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
17202 if (DAG.getDataLayout().isBigEndian())
17205 SDValue(CmpSwap, 0));
17207 SDValue(CmpSwap, 0));
17208 Results.push_back(
17209 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
17210 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
17211 return;
17212 }
17213
17214 unsigned Opcode;
17215 switch (MemOp->getMergedOrdering()) {
17217 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
17218 break;
17220 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
17221 break;
17223 Opcode = AArch64::CMP_SWAP_128_RELEASE;
17224 break;
17227 Opcode = AArch64::CMP_SWAP_128;
17228 break;
17229 default:
17230 llvm_unreachable("Unexpected ordering!");
17231 }
17232
17233 auto Desired = splitInt128(N->getOperand(2), DAG);
17234 auto New = splitInt128(N->getOperand(3), DAG);
17235 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
17236 New.first, New.second, N->getOperand(0)};
17239 Ops);
17241
17243 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
17244 Results.push_back(SDValue(CmpSwap, 3));
17245}
17246
17247void AArch64TargetLowering::ReplaceNodeResults(
17248 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
17249 switch (N->getOpcode()) {
17250 default:
17251 llvm_unreachable("Don't know how to custom expand this");
17252 case ISD::BITCAST:
17253 ReplaceBITCASTResults(N, Results, DAG);
17254 return;
17255 case ISD::VECREDUCE_ADD:
17260 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
17261 return;
17262
17263 case ISD::CTPOP:
17264 if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
17265 Results.push_back(Result);
17266 return;
17267 case AArch64ISD::SADDV:
17269 return;
17270 case AArch64ISD::UADDV:
17272 return;
17273 case AArch64ISD::SMINV:
17275 return;
17276 case AArch64ISD::UMINV:
17278 return;
17279 case AArch64ISD::SMAXV:
17281 return;
17282 case AArch64ISD::UMAXV:
17284 return;
17285 case ISD::FP_TO_UINT:
17286 case ISD::FP_TO_SINT:
17287 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
17288 // Let normal code take care of it by not adding anything to Results.
17289 return;
17291 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
17292 return;
17293 case ISD::LOAD: {
17295 "unexpected load's value type");
17296 LoadSDNode *LoadNode = cast<LoadSDNode>(N);
17297 if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
17298 // Non-volatile loads are optimized later in AArch64's load/store
17299 // optimizer.
17300 return;
17301 }
17302
17305 DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
17306 {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
17307 LoadNode->getMemOperand());
17308
17310 Result.getValue(0), Result.getValue(1));
17311 Results.append({Pair, Result.getValue(2) /* Chain */});
17312 return;
17313 }
17315 ReplaceExtractSubVectorResults(N, Results, DAG);
17316 return;
17318 // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate
17319 // to common code for result type legalisation
17320 return;
17322 EVT VT = N->getValueType(0);
17323 assert((VT == MVT::i8 || VT == MVT::i16) &&
17324 "custom lowering for unexpected type");
17325
17326 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
17327 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
17328 switch (IntID) {
17329 default:
17330 return;
17331 case Intrinsic::aarch64_sve_clasta_n: {
17332 SDLoc DL(N);
17333 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
17334 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
17335 N->getOperand(1), Op2, N->getOperand(3));
17336 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17337 return;
17338 }
17339 case Intrinsic::aarch64_sve_clastb_n: {
17340 SDLoc DL(N);
17341 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
17342 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
17343 N->getOperand(1), Op2, N->getOperand(3));
17344 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17345 return;
17346 }
17347 case Intrinsic::aarch64_sve_lasta: {
17348 SDLoc DL(N);
17349 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
17350 N->getOperand(1), N->getOperand(2));
17351 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17352 return;
17353 }
17354 case Intrinsic::aarch64_sve_lastb: {
17355 SDLoc DL(N);
17356 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
17357 N->getOperand(1), N->getOperand(2));
17358 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17359 return;
17360 }
17361 }
17362 }
17363 }
17364}
17365
17367 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
17369 return true;
17370}
17371
17372unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
17373 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
17374 // reciprocal if there are three or more FDIVs.
17375 return 3;
17376}
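// For illustration, with three or more divisions by the same value, e.g.
//   x = a / d; y = b / d; z = c / d;
// the generic combine this threshold feeds can emit one reciprocal and three
// multiplies instead (a sketch of the intent; it only fires under the
// appropriate fast-math flags):
//   r = 1.0 / d; x = a * r; y = b * r; z = c * r;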
17377
17380 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
17381 // v4i16, v2i32 instead of promoting them.
17382 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
17383 VT == MVT::v1f32)
17384 return TypeWidenVector;
17385
17387}
17388
17389// Loads and stores less than 128-bits are already atomic; ones above that
17390// are doomed anyway, so defer to the default libcall and blame the OS when
17391// things go wrong.
17393 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
17394 return Size == 128;
17395}
17396
17397// Loads and stores less than 128-bits are already atomic; ones above that
17398// are doomed anyway, so defer to the default libcall and blame the OS when
17399// things go wrong.
17405
17406 // For the real atomic operations, we have ldxr/stxr up to 128 bits.
17409 if (AI->isFloatingPointOperation())
17411
17412 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
17413 if (Size > 128) return AtomicExpansionKind::None;
17414
17415 // Nand is not supported in LSE.
17416 // Leave 128 bits to LLSC or CmpXChg.
17417 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
17418 if (Subtarget->hasLSE())
17420 if (Subtarget->outlineAtomics()) {
17421 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
17422 // Don't outline them unless
17423 // (1) high level <atomic> support approved:
17424 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
17425 // (2) low level libgcc and compiler-rt support implemented by:
17426 // min/max outline atomics helpers
17427 if (AI->getOperation() != AtomicRMWInst::Min &&
17432 }
17433 }
17434 }
17435
17436 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
17437 // implement atomicrmw without spilling. If the target address is also on the
17438 // stack and close enough to the spill slot, this can lead to a situation
17439 // where the monitor always gets cleared and the atomic operation can never
17440 // succeed. So at -O0 lower this operation to a CAS loop.
17441 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
17443
17445}
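// For illustration, an atomicrmw expanded to LL/SC as above becomes a retry
// loop; a hedged sketch of the kind of sequence that results for
// "atomicrmw add i64, seq_cst" (register names invented):
//   .retry:
//     ldaxr x8, [x0]        // load-exclusive with acquire
//     add   x9, x8, x1
//     stlxr w10, x9, [x0]   // store-exclusive with release
//     cbnz  w10, .retry     // retry if the exclusive monitor was lost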
17446
17449 AtomicCmpXchgInst *AI) const {
17450 // If subtarget has LSE, leave cmpxchg intact for codegen.
17451 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
17453 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
17454 // implement cmpxchg without spilling. If the address being exchanged is also
17455 // on the stack and close enough to the spill slot, this can lead to a
17456 // situation where the monitor always gets cleared and the atomic operation
17457 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
17458 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
17460
17461 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
17462 // it.
17464 if (Size > 64)
17466
17468}
17469
17471 Type *ValueTy, Value *Addr,
17472 AtomicOrdering Ord) const {
17473 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17474 bool IsAcquire = isAcquireOrStronger(Ord);
17475
17476 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
17477 // intrinsic must return {i64, i64} and we have to recombine them into a
17478 // single i128 here.
17479 if (ValueTy->getPrimitiveSizeInBits() == 128) {
17481 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
17483
17484 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
17485 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
17486
17487 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
17488 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
17489 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
17490 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
17491 return Builder.CreateOr(
17492 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
17493 }
17494
17495 Type *Tys[] = { Addr->getType() };
17497 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
17499
17500 const DataLayout &DL = M->getDataLayout();
17501 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
17502 Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
17503
17504 return Builder.CreateBitCast(Trunc, ValueTy);
17505}
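// For illustration, for a 128-bit acquire load-exclusive the builder calls
// above produce IR along these lines (a hedged sketch; value names invented):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0
//   %hi   = extractvalue { i64, i64 } %lohi, 1
//   %lo64 = zext i64 %lo to i128
//   %hi64 = zext i64 %hi to i128
//   %hish = shl i128 %hi64, 64
//   %val  = or i128 %lo64, %hish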
17506
17508 IRBuilderBase &Builder) const {
17509 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17510 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
17511}
17512
17514 Value *Val, Value *Addr,
17515 AtomicOrdering Ord) const {
17516 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17517 bool IsRelease = isReleaseOrStronger(Ord);
17518
17519 // Since the intrinsics must have legal type, the i128 intrinsics take two
17520 // parameters: "i64, i64". We must marshal Val into the appropriate form
17521 // before the call.
17522 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
17524 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
17526 Type *Int64Ty = Type::getInt64Ty(M->getContext());
17527
17528 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
17529 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
17530 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
17531 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
17532 }
17533
17535 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
17536 Type *Tys[] = { Addr->getType() };
17538
17539 const DataLayout &DL = M->getDataLayout();
17540 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
17541 Val = Builder.CreateBitCast(Val, IntValTy);
17542
17543 return Builder.CreateCall(Stxr,
17544 {Builder.CreateZExtOrBitCast(
17545 Val, Stxr->getFunctionType()->getParamType(0)),
17546 Addr});
17547}
17548
17550 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
17551 const DataLayout &DL) const {
17552 if (!Ty->isArrayTy()) {
17553 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
17554 return TySize.isScalable() && TySize.getKnownMinSize() > 128;
17555 }
17556
17557 // All non-aggregate members of the type must have the same type.
17558 SmallVector<EVT> ValueVTs;
17559 ComputeValueVTs(*this, DL, Ty, ValueVTs);
17560 return is_splat(ValueVTs);
17561}
17562
17563bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
17564 EVT) const {
17565 return false;
17566}
17567
17568static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
17569 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
17571 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
17572 return IRB.CreatePointerCast(
17574 Offset),
17575 IRB.getInt8PtrTy()->getPointerTo(0));
17576}
17577
17579 // Android provides a fixed TLS slot for the stack cookie. See the definition
17580 // of TLS_SLOT_STACK_GUARD in
17581 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
17582 if (Subtarget->isTargetAndroid())
17583 return UseTlsOffset(IRB, 0x28);
17584
17585 // Fuchsia is similar.
17586 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
17587 if (Subtarget->isTargetFuchsia())
17588 return UseTlsOffset(IRB, -0x10);
17589
17591}
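// For illustration, the Android path above reads the cookie straight off the
// thread pointer; roughly the following IR results (a sketch, names invented):
//   %tp    = call i8* @llvm.thread.pointer()
//   %slot  = getelementptr i8, i8* %tp, i32 40   ; 0x28 = TLS slot 5 * 8 bytes
//   %guard = bitcast i8* %slot to i8**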
17592
17594 // MSVC CRT provides functionalities for stack protection.
17595 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
17596 // MSVC CRT has a global variable holding security cookie.
17597 M.getOrInsertGlobal("__security_cookie",
17598 Type::getInt8PtrTy(M.getContext()));
17599
17600 // MSVC CRT has a function to validate security cookie.
17601 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
17602 "__security_check_cookie", Type::getVoidTy(M.getContext()),
17603 Type::getInt8PtrTy(M.getContext()));
17604 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
17605 F->setCallingConv(CallingConv::Win64);
17606 F->addAttribute(1, Attribute::AttrKind::InReg);
17607 }
17608 return;
17609 }
17611}
17612
17614 // MSVC CRT has a global variable holding security cookie.
17615 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
17616 return M.getGlobalVariable("__security_cookie");
17618}
17619
17621 // MSVC CRT has a function to validate security cookie.
17622 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
17623 return M.getFunction("__security_check_cookie");
17625}
17626
17627Value *
17629 // Android provides a fixed TLS slot for the SafeStack pointer. See the
17630 // definition of TLS_SLOT_SAFESTACK in
17631 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
17632 if (Subtarget->isTargetAndroid())
17633 return UseTlsOffset(IRB, 0x48);
17634
17635 // Fuchsia is similar.
17636 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
17637 if (Subtarget->isTargetFuchsia())
17638 return UseTlsOffset(IRB, -0x8);
17639
17641}
17642
17644 const Instruction &AndI) const {
17645 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
17646 // this likely allows the and/cmp/br to be folded into a single tbz instruction. It
17647 // may be beneficial to sink in other cases, but we would have to check that
17648 // the cmp would not get folded into the br to form a cbz for these to be
17649 // beneficial.
17650 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
17651 if (!Mask)
17652 return false;
17653 return Mask->getValue().isPowerOf2();
17654}
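// For illustration, the power-of-two restriction above targets the
// test-bit-and-branch instructions; a rough sketch of the intended match:
//   if (x & 0x8) { ... }          // single-bit mask
//     ==> tbnz w0, #3, <target>   // no separate and/cmp/b.ne needed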
17655
17659 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
17660 SelectionDAG &DAG) const {
17661 // Does baseline recommend not to perform the fold by default?
17663 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
17664 return false;
17665 // Else, if this is a vector shift, prefer 'shl'.
17666 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
17667}
17668
17670 SDNode *N) const {
17672 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
17673 return false;
17674 return true;
17675}
17676
17678 // Update IsSplitCSR in AArch64FunctionInfo.
17679 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
17680 AFI->setIsSplitCSR(true);
17681}
17682
17684 MachineBasicBlock *Entry,
17685 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
17686 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
17687 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
17688 if (!IStart)
17689 return;
17690
17691 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
17692 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
17694 for (const MCPhysReg *I = IStart; *I; ++I) {
17695 const TargetRegisterClass *RC = nullptr;
17696 if (AArch64::GPR64RegClass.contains(*I))
17697 RC = &AArch64::GPR64RegClass;
17698 else if (AArch64::FPR64RegClass.contains(*I))
17699 RC = &AArch64::FPR64RegClass;
17700 else
17701 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
17702
17703 Register NewVR = MRI->createVirtualRegister(RC);
17704 // Create copy from CSR to a virtual register.
17705 // FIXME: this currently does not emit CFI pseudo-instructions, it works
17706 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
17707 // nounwind. If we want to generalize this later, we may need to emit
17708 // CFI pseudo-instructions.
17709 assert(Entry->getParent()->getFunction().hasFnAttribute(
17710 Attribute::NoUnwind) &&
17711 "Function should be nounwind in insertCopiesSplitCSR!");
17712 Entry->addLiveIn(*I);
17713 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
17714 .addReg(*I);
17715
17716 // Insert the copy-back instructions right before the terminator.
17717 for (auto *Exit : Exits)
17718 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
17719 TII->get(TargetOpcode::COPY), *I)
17720 .addReg(NewVR);
17721 }
17722}
17723
17725 // Integer division on AArch64 is expensive. However, when aggressively
17726 // optimizing for code size, we prefer to use a div instruction, as it is
17727 // usually smaller than the alternative sequence.
17728 // The exception to this is vector division. Since AArch64 doesn't have vector
17729 // integer division, leaving the division as-is is a loss even in terms of
17730 // size, because it will have to be scalarized, while the alternative code
17731 // sequence can be performed in vector form.
17732 bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
17733 return OptSize && !VT.isVector();
17734}
17735
17737 // We want inc-of-add for scalars and sub-of-not for vectors.
17738 return VT.isScalarInteger();
17739}
17740
17742 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
17743}
17744
17745unsigned
17747 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
17748 return getPointerTy(DL).getSizeInBits();
17749
17750 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
17751}
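// For illustration, the "3 pointers + 2 ints" size above corresponds to the
// AAPCS64 va_list record (a sketch; field names follow the ABI document):
//   struct va_list { void *__stack; void *__gr_top; void *__vr_top;
//                    int __gr_offs; int __vr_offs; };    // 32 bytes
// while Darwin and Windows use a plain char* cursor (pointer sized).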
17752
17753void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
17756}
17757
17758// Unlike X86, we let frame lowering assign offsets to all catch objects.
17760 return false;
17761}
17762
17763bool AArch64TargetLowering::shouldLocalize(
17764 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
17765 switch (MI.getOpcode()) {
17766 case TargetOpcode::G_GLOBAL_VALUE: {
17767 // On Darwin, TLS global vars get selected into function calls, which
17768 // we don't want localized, as they can get moved into the middle of
17769 // another call sequence.
17770 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
17771 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
17772 return false;
17773 break;
17774 }
17775 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
17776 // localizable.
17777 case AArch64::ADRP:
17778 case AArch64::G_ADD_LOW:
17779 return true;
17780 default:
17781 break;
17782 }
17784}
17785
17787 if (isa<ScalableVectorType>(Inst.getType()))
17788 return true;
17789
17790 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
17792 return true;
17793
17794 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
17795 if (isa<ScalableVectorType>(AI->getAllocatedType()))
17796 return true;
17797 }
17798
17799 return false;
17800}
17801
17802// Return the largest legal scalable vector type that matches VT's element type.
17806 "Expected legal fixed length vector!");
17807 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
17808 default:
17809 llvm_unreachable("unexpected element type for SVE container");
17810 case MVT::i8:
17811 return EVT(MVT::nxv16i8);
17812 case MVT::i16:
17813 return EVT(MVT::nxv8i16);
17814 case MVT::i32:
17815 return EVT(MVT::nxv4i32);
17816 case MVT::i64:
17817 return EVT(MVT::nxv2i64);
17818 case MVT::f16:
17819 return EVT(MVT::nxv8f16);
17820 case MVT::f32:
17821 return EVT(MVT::nxv4f32);
17822 case MVT::f64:
17823 return EVT(MVT::nxv2f64);
17824 }
17825}
17826
17827// Return a PTRUE with active lanes corresponding to the extent of VT.
17829 EVT VT) {
17832 "Expected legal fixed length vector!");
17833
17834 int PgPattern;
17835 switch (VT.getVectorNumElements()) {
17836 default:
17837 llvm_unreachable("unexpected element count for SVE predicate");
17838 case 1:
17839 PgPattern = AArch64SVEPredPattern::vl1;
17840 break;
17841 case 2:
17842 PgPattern = AArch64SVEPredPattern::vl2;
17843 break;
17844 case 4:
17845 PgPattern = AArch64SVEPredPattern::vl4;
17846 break;
17847 case 8:
17848 PgPattern = AArch64SVEPredPattern::vl8;
17849 break;
17850 case 16:
17851 PgPattern = AArch64SVEPredPattern::vl16;
17852 break;
17853 case 32:
17854 PgPattern = AArch64SVEPredPattern::vl32;
17855 break;
17856 case 64:
17857 PgPattern = AArch64SVEPredPattern::vl64;
17858 break;
17859 case 128:
17860 PgPattern = AArch64SVEPredPattern::vl128;
17861 break;
17862 case 256:
17863 PgPattern = AArch64SVEPredPattern::vl256;
17864 break;
17865 }
17866
17867 // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
17868 // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
17869 // variants of instructions when available.
17870
17871 MVT MaskVT;
17872 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
17873 default:
17874 llvm_unreachable("unexpected element type for SVE predicate");
17875 case MVT::i8:
17877 break;
17878 case MVT::i16:
17879 case MVT::f16:
17881 break;
17882 case MVT::i32:
17883 case MVT::f32:
17885 break;
17886 case MVT::i64:
17887 case MVT::f64:
17889 break;
17890 }
17891
17892 return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
17894}
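// For illustration, a fixed length v8i32 operand handled this way gets its
// governing predicate built roughly as (a sketch of the resulting node and
// instruction):
//   AArch64ISD::PTRUE nxv4i1, vl8   ->   ptrue p0.s, vl8
// i.e. only the first eight 32-bit lanes of the SVE register are active.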
17895
17897 EVT VT) {
17899 "Expected legal scalable vector!");
17901 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
17902}
17903
17905 if (VT.isFixedLengthVector())
17906 return getPredicateForFixedLengthVector(DAG, DL, VT);
17907
17908 return getPredicateForScalableVector(DAG, DL, VT);
17909}
17910
17911// Grow V to consume an entire SVE register.
17913 assert(VT.isScalableVector() &&
17914 "Expected to convert into a scalable vector!");
17915 assert(V.getValueType().isFixedLengthVector() &&
17916 "Expected a fixed length vector operand!");
17917 SDLoc DL(V);
17918 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17919 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
17920}
17921
17922// Shrink V so it's just big enough to maintain a VT's worth of data.
17925 "Expected to convert into a fixed length vector!");
17926 assert(V.getValueType().isScalableVector() &&
17927 "Expected a scalable vector operand!");
17928 SDLoc DL(V);
17929 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17930 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
17931}
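// For illustration, the two helpers above round-trip a fixed length value
// through an SVE register (a minimal sketch for v4i32 with an nxv4i32
// container):
//   Scalable = INSERT_SUBVECTOR undef:nxv4i32, V:v4i32, 0   // convertToScalableVector
//   Fixed    = EXTRACT_SUBVECTOR Scalable:nxv4i32, 0        // convertFromScalableVector
// Lanes beyond the fixed width are undefined, which is why the lowerings
// below predicate operations using getPredicateForFixedLengthVector.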
17932
17933// Convert all fixed length vector loads larger than NEON to masked_loads.
17934SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
17935 SDValue Op, SelectionDAG &DAG) const {
17936 auto Load = cast<LoadSDNode>(Op);
17937
17938 SDLoc DL(Op);
17939 EVT VT = Op.getValueType();
17941
17942 auto NewLoad = DAG.getMaskedLoad(
17943 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
17945 Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
17946 Load->getExtensionType());
17947
17948 auto Result = convertFromScalableVector(DAG, VT, NewLoad);
17949 SDValue MergedValues[2] = {Result, Load->getChain()};
17950 return DAG.getMergeValues(MergedValues, DL);
17951}
17952
17954 SelectionDAG &DAG) {
17955 SDLoc DL(Mask);
17956 EVT InVT = Mask.getValueType();
17958
17959 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
17960 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
17961 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
17962
17963 EVT CmpVT = Pg.getValueType();
17965 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
17966}
17967
17968 // Convert fixed length vector masked loads larger than NEON to SVE masked loads.
17969SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
17970 SDValue Op, SelectionDAG &DAG) const {
17971 auto Load = cast<MaskedLoadSDNode>(Op);
17972
17973 if (Load->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD)
17974 return SDValue();
17975
17976 SDLoc DL(Op);
17977 EVT VT = Op.getValueType();
17979
17981
17983 bool IsPassThruZeroOrUndef = false;
17984
17985 if (Load->getPassThru()->isUndef()) {
17987 IsPassThruZeroOrUndef = true;
17988 } else {
17989 if (ContainerVT.isInteger())
17991 else
17993 if (isZerosVector(Load->getPassThru().getNode()))
17994 IsPassThruZeroOrUndef = true;
17995 }
17996
17997 auto NewLoad = DAG.getMaskedLoad(
17998 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
17999 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
18000 Load->getAddressingMode(), Load->getExtensionType());
18001
18002 if (!IsPassThruZeroOrUndef) {
18004 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
18006 }
18007
18008 auto Result = convertFromScalableVector(DAG, VT, NewLoad);
18009 SDValue MergedValues[2] = {Result, Load->getChain()};
18010 return DAG.getMergeValues(MergedValues, DL);
18011}
18012
18013// Convert all fixed length vector stores larger than NEON to masked_stores.
18014SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
18015 SDValue Op, SelectionDAG &DAG) const {
18016 auto Store = cast<StoreSDNode>(Op);
18017
18018 SDLoc DL(Op);
18019 EVT VT = Store->getValue().getValueType();
18021
18022 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
18023 return DAG.getMaskedStore(
18024 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
18025 getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
18026 Store->getMemOperand(), Store->getAddressingMode(),
18027 Store->isTruncatingStore());
18028}
18029
18030SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
18031 SDValue Op, SelectionDAG &DAG) const {
18032 auto Store = cast<MaskedStoreSDNode>(Op);
18033
18034 if (Store->isTruncatingStore())
18035 return SDValue();
18036
18037 SDLoc DL(Op);
18038 EVT VT = Store->getValue().getValueType();
18040
18041 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
18043
18044 return DAG.getMaskedStore(
18045 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
18046 Mask, Store->getMemoryVT(), Store->getMemOperand(),
18047 Store->getAddressingMode(), Store->isTruncatingStore());
18048}
18049
18050SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
18051 SDValue Op, SelectionDAG &DAG) const {
18052 SDLoc dl(Op);
18053 EVT VT = Op.getValueType();
18055
18056 bool Signed = Op.getOpcode() == ISD::SDIV;
18058
18059 // Scalable vector i32/i64 DIV is supported.
18060 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18061 return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
18062
18063 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
18066 EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
18068
18069 // If this is not a full vector, extend, div, and truncate it.
18072 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18073 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
18074 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
18075 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
18076 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
18077 }
18078
18079 // Convert the operands to scalable vectors.
18080 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
18081 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
18082
18083 // Extend the scalable operands.
18086 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
18087 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
18088 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
18089 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
18090
18091 // Convert back to fixed vectors so the DIV can be further lowered.
18096 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
18097 Op0Lo, Op1Lo);
18098 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
18099 Op0Hi, Op1Hi);
18100
18101 // Convert again to scalable vectors to truncate.
18106
18108}
18109
18110SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
18111 SDValue Op, SelectionDAG &DAG) const {
18112 EVT VT = Op.getValueType();
18113 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18114
18115 SDLoc DL(Op);
18116 SDValue Val = Op.getOperand(0);
18118 Val = convertToScalableVector(DAG, ContainerVT, Val);
18119
18120 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
18122
18123 // Repeatedly unpack Val until the result is of the desired element type.
18124 switch (ContainerVT.getSimpleVT().SimpleTy) {
18125 default:
18126 llvm_unreachable("unimplemented container type");
18127 case MVT::nxv16i8:
18128 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
18129 if (VT.getVectorElementType() == MVT::i16)
18130 break;
18132 case MVT::nxv8i16:
18133 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
18134 if (VT.getVectorElementType() == MVT::i32)
18135 break;
18137 case MVT::nxv4i32:
18138 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
18139 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
18140 break;
18141 }
18142
18143 return convertFromScalableVector(DAG, VT, Val);
18144}
18145
18146SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
18147 SDValue Op, SelectionDAG &DAG) const {
18148 EVT VT = Op.getValueType();
18149 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18150
18151 SDLoc DL(Op);
18152 SDValue Val = Op.getOperand(0);
18154 Val = convertToScalableVector(DAG, ContainerVT, Val);
18155
18156 // Repeatedly truncate Val until the result is of the desired element type.
18157 switch (ContainerVT.getSimpleVT().SimpleTy) {
18158 default:
18159 llvm_unreachable("unimplemented container type");
18160 case MVT::nxv2i64:
18161 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
18162 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
18163 if (VT.getVectorElementType() == MVT::i32)
18164 break;
18166 case MVT::nxv4i32:
18167 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
18168 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
18169 if (VT.getVectorElementType() == MVT::i16)
18170 break;
18172 case MVT::nxv8i16:
18173 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
18174 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
18175 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
18176 break;
18177 }
18178
18179 return convertFromScalableVector(DAG, VT, Val);
18180}
18181
18182SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
18183 SDValue Op, SelectionDAG &DAG) const {
18184 EVT VT = Op.getValueType();
18185 EVT InVT = Op.getOperand(0).getValueType();
18186 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
18187
18188 SDLoc DL(Op);
18190 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
18191
18192 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
18193}
18194
18195SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
18196 SDValue Op, SelectionDAG &DAG) const {
18197 EVT VT = Op.getValueType();
18198 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18199
18200 SDLoc DL(Op);
18201 EVT InVT = Op.getOperand(0).getValueType();
18203 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
18204
18206 Op.getOperand(1), Op.getOperand(2));
18207
18208 return convertFromScalableVector(DAG, VT, ScalableRes);
18209}
18210
18211// Convert vector operation 'Op' to an equivalent predicated operation whereby
18212// the original operation's type is used to construct a suitable predicate.
18213// NOTE: The results for inactive lanes are undefined.
18214SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
18215 SelectionDAG &DAG,
18216 unsigned NewOp,
18217 bool OverrideNEON) const {
18218 EVT VT = Op.getValueType();
18219 SDLoc DL(Op);
18220 auto Pg = getPredicateForVector(DAG, DL, VT);
18221
18222 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
18224
18225 // Create list of operands by converting existing ones to scalable types.
18227 for (const SDValue &V : Op->op_values()) {
18228 if (isa<CondCodeSDNode>(V)) {
18229 Operands.push_back(V);
18230 continue;
18231 }
18232
18233 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
18234 EVT VTArg = VTNode->getVT().getVectorElementType();
18235 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
18236 Operands.push_back(DAG.getValueType(NewVTArg));
18237 continue;
18238 }
18239
18240 assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
18241 "Only fixed length vectors are supported!");
18242 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
18243 }
18244
18246 Operands.push_back(DAG.getUNDEF(ContainerVT));
18247
18249 return convertFromScalableVector(DAG, VT, ScalableRes);
18250 }
18251
18252 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
18253
18255 for (const SDValue &V : Op->op_values()) {
18256 assert((!V.getValueType().isVector() ||
18257 V.getValueType().isScalableVector()) &&
18258 "Only scalable vectors are supported!");
18259 Operands.push_back(V);
18260 }
18261
18263 Operands.push_back(DAG.getUNDEF(VT));
18264
18265 return DAG.getNode(NewOp, DL, VT, Operands);
18266}
18267
18268// If a fixed length vector operation has no side effects when applied to
18269// undefined elements, we can safely use scalable vectors to perform the same
18270// operation without needing to worry about predication.
18271SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
18272 SelectionDAG &DAG) const {
18273 EVT VT = Op.getValueType();
18274 assert(useSVEForFixedLengthVectorVT(VT) &&
18275 "Only expected to lower fixed length vector operation!");
18277
18278 // Create list of operands by converting existing ones to scalable types.
18280 for (const SDValue &V : Op->op_values()) {
18281 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
18282
18283 // Pass through non-vector operands.
18284 if (!V.getValueType().isVector()) {
18285 Ops.push_back(V);
18286 continue;
18287 }
18288
18289 // "cast" fixed length vector to a scalable vector.
18290 assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
18291 "Only fixed length vectors are supported!");
18292 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
18293 }
18294
18295 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
18296 return convertFromScalableVector(DAG, VT, ScalableRes);
18297}
18298
18299SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
18300 SelectionDAG &DAG) const {
18301 SDLoc DL(ScalarOp);
18302 SDValue AccOp = ScalarOp.getOperand(0);
18303 SDValue VecOp = ScalarOp.getOperand(1);
18304 EVT SrcVT = VecOp.getValueType();
18305 EVT ResVT = SrcVT.getVectorElementType();
18306
18308 if (SrcVT.isFixedLengthVector()) {
18311 }
18312
18314 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
18315
18316 // Convert operands to Scalable.
18318 DAG.getUNDEF(ContainerVT), AccOp, Zero);
18319
18320 // Perform reduction.
18322 Pg, AccOp, VecOp);
18323
18324 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
18325}
18326
18327SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
18328 SelectionDAG &DAG) const {
18329 SDLoc DL(ReduceOp);
18330 SDValue Op = ReduceOp.getOperand(0);
18331 EVT OpVT = Op.getValueType();
18332 EVT VT = ReduceOp.getValueType();
18333
18334 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18335 return SDValue();
18336
18338
18339 switch (ReduceOp.getOpcode()) {
18340 default:
18341 return SDValue();
18342 case ISD::VECREDUCE_OR:
18343 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
18344 case ISD::VECREDUCE_AND: {
18345 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
18346 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
18347 }
18348 case ISD::VECREDUCE_XOR: {
18349 SDValue ID =
18350 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
18351 SDValue Cntp =
18352 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
18353 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
18354 }
18355 }
18356
18357 return SDValue();
18358}
18359
18360SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
18362 SelectionDAG &DAG) const {
18363 SDLoc DL(ScalarOp);
18364 SDValue VecOp = ScalarOp.getOperand(0);
18365 EVT SrcVT = VecOp.getValueType();
18366
18367 if (useSVEForFixedLengthVectorVT(SrcVT, true)) {
18370 }
18371
18372 // UADDV always returns an i64 result.
18373 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
18374 SrcVT.getVectorElementType();
18375 EVT RdxVT = SrcVT;
18376 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
18378
18380 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
18382 Rdx, DAG.getConstant(0, DL, MVT::i64));
18383
18384 // The VEC_REDUCE nodes expect an element size result.
18385 if (ResVT != ScalarOp.getValueType())
18386 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
18387
18388 return Res;
18389}
18390
18391SDValue
18392AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
18393 SelectionDAG &DAG) const {
18394 EVT VT = Op.getValueType();
18395 SDLoc DL(Op);
18396
18397 EVT InVT = Op.getOperand(1).getValueType();
18399 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
18400 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
18401
18402 // Convert the mask to a predicate (NOTE: We don't need to worry about
18403 // inactive lanes since VSELECT is safe when given undefined elements).
18404 EVT MaskVT = Op.getOperand(0).getValueType();
18406 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
18408 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
18409
18411 Mask, Op1, Op2);
18412
18413 return convertFromScalableVector(DAG, VT, ScalableRes);
18414}
18415
18416SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
18417 SDValue Op, SelectionDAG &DAG) const {
18418 SDLoc DL(Op);
18419 EVT InVT = Op.getOperand(0).getValueType();
18421
18422 assert(useSVEForFixedLengthVectorVT(InVT) &&
18423 "Only expected to lower fixed length vector operation!");
18424 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
18425 "Expected integer result of the same bit length as the inputs!");
18426
18427 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
18428 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
18429 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
18430
18431 EVT CmpVT = Pg.getValueType();
18433 {Pg, Op1, Op2, Op.getOperand(2)});
18434
18435 EVT PromoteVT = ContainerVT.changeTypeToInteger();
18436 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
18437 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
18438}
18439
18440SDValue
18441AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
18442 SelectionDAG &DAG) const {
18443 SDLoc DL(Op);
18444 auto SrcOp = Op.getOperand(0);
18445 EVT VT = Op.getValueType();
18448 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
18449
18452 return convertFromScalableVector(DAG, VT, Op);
18453}
18454
18455SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
18456 SDValue Op, SelectionDAG &DAG) const {
18457 SDLoc DL(Op);
18458 unsigned NumOperands = Op->getNumOperands();
18459
18460 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
18461 "Unexpected number of operands in CONCAT_VECTORS");
18462
18463 auto SrcOp1 = Op.getOperand(0);
18464 auto SrcOp2 = Op.getOperand(1);
18465 EVT VT = Op.getValueType();
18466 EVT SrcVT = SrcOp1.getValueType();
18467
18468 if (NumOperands > 2) {
18470 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
18471 for (unsigned I = 0; I < NumOperands; I += 2)
18472 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
18473 Op->getOperand(I), Op->getOperand(I + 1)));
18474
18475 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
18476 }
18477
18479
18483
18485
18486 return convertFromScalableVector(DAG, VT, Op);
18487}
18488
18489SDValue
18490AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
18491 SelectionDAG &DAG) const {
18492 EVT VT = Op.getValueType();
18493 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18494
18495 SDLoc DL(Op);
18496 SDValue Val = Op.getOperand(0);
18497 SDValue Pg = getPredicateForVector(DAG, DL, VT);
18498 EVT SrcVT = Val.getValueType();
18500 EVT ExtendVT = ContainerVT.changeVectorElementType(
18501 SrcVT.getVectorElementType());
18502
18503 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
18504 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
18505
18506 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
18507 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
18509 Pg, Val, DAG.getUNDEF(ContainerVT));
18510
18511 return convertFromScalableVector(DAG, VT, Val);
18512}
18513
18514SDValue
18515AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
18516 SelectionDAG &DAG) const {
18517 EVT VT = Op.getValueType();
18518 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18519
18520 SDLoc DL(Op);
18521 SDValue Val = Op.getOperand(0);
18522 EVT SrcVT = Val.getValueType();
18524 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
18527
18528 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18530 Op.getOperand(1), DAG.getUNDEF(RoundVT));
18531 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
18532 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
18533
18534 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
18535 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
18536}
18537
18538SDValue
18539AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
18540 SelectionDAG &DAG) const {
18541 EVT VT = Op.getValueType();
18542 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18543
18544 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
18545 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
18546 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
18547
18548 SDLoc DL(Op);
18549 SDValue Val = Op.getOperand(0);
18550 EVT SrcVT = Val.getValueType();
18551 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
18552 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
18553
18554 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
18555 ContainerDstVT.getVectorElementType().getSizeInBits()) {
18556 SDValue Pg = getPredicateForVector(DAG, DL, VT);
18557
18558 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
18559 VT.changeTypeToInteger(), Val);
18560
18561 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18562 Val = getSVESafeBitCast(ContainerDstVT.changeTypeToInteger(), Val, DAG);
18563 // Safe to use a larger than specified operand since we just unpacked the
18564 // data, hence the upper bits are zero.
18565 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
18566 DAG.getUNDEF(ContainerDstVT));
18567 return convertFromScalableVector(DAG, VT, Val);
18568 } else {
18569 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
18570 ContainerDstVT.getVectorElementType());
18571 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
18572
18573 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18574 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
18575 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
18576 Val = convertFromScalableVector(DAG, SrcVT, Val);
18577
18578 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
18579 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
18580 }
18581}
18582
18583SDValue
18584AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
18585 SelectionDAG &DAG) const {
18586 EVT VT = Op.getValueType();
18587 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18588
18589 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
18590 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
18591 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
18592
18593 SDLoc DL(Op);
18594 SDValue Val = Op.getOperand(0);
18595 EVT SrcVT = Val.getValueType();
18596 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
18597 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
18598
18599 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
18600 ContainerDstVT.getVectorElementType().getSizeInBits()) {
18601 EVT CvtVT = ContainerDstVT.changeVectorElementType(
18602 ContainerSrcVT.getVectorElementType());
18603 SDValue Pg = getPredicateForVector(DAG, DL, VT);
18604
18605 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
18606 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
18607
18608 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18609 Val = getSVESafeBitCast(CvtVT, Val, DAG);
18610 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
18611 DAG.getUNDEF(ContainerDstVT));
18612 return convertFromScalableVector(DAG, VT, Val);
18613 } else {
18614 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
18615 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
18616
18617 // Safe to use a larger than specified result since an fp_to_int where the
18618 // result doesn't fit into the destination is undefined.
18619 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18620 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
18621 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
18622
18623 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
18624 }
18625}
18626
18627SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
18628 SDValue Op, SelectionDAG &DAG) const {
18629 EVT VT = Op.getValueType();
18630 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18631
18632 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
18633 auto ShuffleMask = SVN->getMask();
18634
18635 SDLoc DL(Op);
18636 SDValue Op1 = Op.getOperand(0);
18637 SDValue Op2 = Op.getOperand(1);
18638
18639 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18640 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
18641 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
18642
18643 bool ReverseEXT = false;
18644 unsigned Imm;
18645 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
18646 Imm == VT.getVectorNumElements() - 1) {
18647 if (ReverseEXT)
18648 std::swap(Op1, Op2);
18649
18650 EVT ScalarTy = VT.getVectorElementType();
18651 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
18652 ScalarTy = MVT::i32;
18653 SDValue Scalar = DAG.getNode(
18654 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
18655 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
18656 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
18657 return convertFromScalableVector(DAG, VT, Op);
18658 }
18659
18660 return SDValue();
18661}
18662
18663SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
18664 SelectionDAG &DAG) const {
18665 SDLoc DL(Op);
18666 EVT InVT = Op.getValueType();
18667 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18668 (void)TLI;
18669
18670 assert(VT.isScalableVector() && TLI.isTypeLegal(VT) &&
18671 InVT.isScalableVector() && TLI.isTypeLegal(InVT) &&
18672 "Only expect to cast between legal scalable vector types!");
18673 assert((VT.getVectorElementType() == MVT::i1) ==
18674 (InVT.getVectorElementType() == MVT::i1) &&
18675 "Cannot cast between data and predicate scalable vector types!");
18676
18677 if (InVT == VT)
18678 return Op;
18679
18680 if (VT.getVectorElementType() == MVT::i1)
18681 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
18682
18683 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
18684 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
18685
18686 // Pack input if required.
18687 if (InVT != PackedInVT)
18688 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
18689
18690 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
18691
18692 // Unpack result if required.
18693 if (VT != PackedVT)
18694 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
18695
18696 return Op;
18697}
18698
18699bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const {
18700 return ::isAllActivePredicate(N);
18701}
18702
18703EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
18704 return ::getPromotedVTForPredicate(VT);
18705}
18706
18707bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
18708 SDValue Op, const APInt &OriginalDemandedBits,
18709 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
18710 unsigned Depth) const {
18711
18712 unsigned Opc = Op.getOpcode();
18713 switch (Opc) {
18714 case AArch64ISD::VSHL: {
18715 // Match (VSHL (VLSHR Val X) X)
18716 SDValue ShiftL = Op;
18717 SDValue ShiftR = Op->getOperand(0);
18718 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
18719 return false;
18720
18721 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
18722 return false;
18723
18724 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
18725 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
18726
18727 // Other cases can be handled as well, but this is not
18728 // implemented.
18729 if (ShiftRBits != ShiftLBits)
18730 return false;
18731
18732 unsigned ScalarSize = Op.getScalarValueSizeInBits();
18733 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
18734
18735 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
18736 APInt UnusedBits = ~OriginalDemandedBits;
18737
18738 if ((ZeroBits & UnusedBits) != ZeroBits)
18739 return false;
18740
18741 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
18742 // used - simplify to just Val.
18743 return TLO.CombineTo(Op, ShiftR->getOperand(0));
18744 }
18745 }
18746
18747 return TargetLowering::SimplifyDemandedBitsForTargetNode(
18748 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
18749}
18750
18751bool AArch64TargetLowering::isConstantUnsignedBitfieldExtactLegal(
18752 unsigned Opc, LLT Ty1, LLT Ty2) const {
18753 return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
18754}
unsigned const MachineRegisterInfo * MRI
static unsigned MatchRegisterName(StringRef Name)
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG)
NarrowVector - Given a value in the V128 register class, produce the equivalent value in the V64 regi...
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static std::pair< SDValue, SDValue > splitInt128(SDValue N, SelectionDAG &DAG)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool areExtractShuffleVectors(Value *Op1, Value *Op2)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static bool isREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
static SDValue performSRLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point divide by power of two into fixed-point to floating-point conversion.
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static bool isMergePassthruOpcode(unsigned Opc)
static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static bool isTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N)
Get rid of unnecessary NVCASTs (that don't change the type).
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, bool &FromHi)
An EXTR instruction is made up of two shifts, ORed together.
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static unsigned getIntrinsicID(const SDNode *N)
static bool IsSVECntIntrinsic(SDValue S)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned)
static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG)
static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT, unsigned &Opcode, bool IsGather, SelectionDAG &DAG)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static bool isAllActivePredicate(SDValue N)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG)
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static PredicateConstraint parsePredicateConstraint(StringRef Constraint)
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static bool isSignExtended(SDNode *N, SelectionDAG &DAG)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
bool getGatherScatterIndexIsExtended(SDValue Index)
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
EXTR instruction extracts a contiguous chunk of bits from two existing registers viewed as a high/low...
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG)
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, SelectionDAG &DAG)
Combines a dup(sext/zext) node pattern into sext/zext(dup) making use of the vector SExt/ZExt rather ...
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
static const unsigned PerfectShuffleTable[6561+1]
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
amdgpu Simplify well known AMD library false FunctionCallee Callee
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
static const MCPhysReg GPRArgRegs[]
Function Alias Analysis Results
assume Assume Builder
This file contains the simple types necessary to represent the attributes associated with functions a...
SmallVector< MachineOperand, 4 > Cond
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< ShadowStackGC > C("shadow-stack", "Very portable GC for uncooperative code generators")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition Compiler.h:281
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
PropagateLiveness Given that RA is a live propagate it s liveness to any other values it uses(according to Uses). void DeadArgumentEliminationPass
else return RetTy
#define LLVM_DEBUG(X)
Definition Debug.h:122
uint64_t Align
uint64_t Offset
uint64_t Addr
uint32_t Index
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
#define RegName(no)
lazy value info
#define F(x, y, z)
Definition MD5.cpp:56
#define I(x, y, z)
Definition MD5.cpp:59
#define G(x, y, z)
Definition MD5.cpp:57
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
unsigned Reg
Module.h This file contains the declarations for the Module class.
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
StandardInstrumentations SI(Debug, VerifyEach)
if(VerifyEach)
const char LLVMTargetMachineRef LLVMPassBuilderOptionsRef Options
const char LLVMTargetMachineRef TM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition Statistic.cpp:46
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:166
static const int BlockSize
Definition TarWriter.cpp:33
This defines the Use class.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:455
static constexpr int Concat[]
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
static bool hasSVEArgsOrReturn(const MachineFunction *MF)
unsigned getPrefLoopLogAlignment() const
const AArch64RegisterInfo * getRegisterInfo() const override
unsigned getPrefFunctionLogAlignment() const
bool isMisaligned128StoreSlow() const
const AArch64InstrInfo * getInstrInfo() const override
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isXRegisterReserved(size_t i) const
bool predictableSelectIsExpensive() const
bool useSVEForFixedLengthVectors() const
unsigned getMinSVEVectorSizeInBits() const
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns true if the given (atomic) store should be expanded by the IR-level AtomicExpand pass into an...
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOpt::Level OptLevel) const override
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, bool *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
InstructionCost getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
bool isAllActivePredicate(SDValue N) const
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
EVT is not used in-tree, but is used by out-of-tree target.
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override
Return true if SHIFT instructions should be expanded to SHIFT_PARTS instructions, and false if a libr...
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
APInt bitcastToAPInt() const
Definition APFloat.h:1132
bool isPosZero() const
Definition APFloat.h:1228
void dump() const
Definition APFloat.cpp:4862
Class for arbitrary precision integers.
Definition APInt.h:70
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:952
static APInt getAllOnesValue(unsigned numBits)
Get the all-ones value.
Definition APInt.h:567
unsigned countTrailingZeros() const
Count the number of trailing zero bits.
Definition APInt.h:1700
unsigned logBase2() const
Definition APInt.h:1811
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:369
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:469
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition APInt.h:667
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition APInt.h:655
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1643
an instruction to allocate memory on the stack
This class represents an incoming formal argument to a Function.
Definition Argument.h:29
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ Nand
*p = ~(old & v)
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but may be faster.
LLVM Basic Block Representation.
Definition BasicBlock.h:59
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:107
const BlockAddress * getBlockAddress() const
A "pseudo-class" with methods for operating on BUILD_VECTORs.
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
unsigned getNextStackOffset() const
getNextStackOffset - Return the next stack offset such that all stack slots satisfy their alignment r...
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
CCValAssign - Represent assignment of one arg/retval to a location.
Value * getArgOperand(unsigned i) const
unsigned getNumArgOperands() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
This is the shared class of boolean and integer constants.
Definition Constants.h:79
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:133
uint64_t getZExtValue() const
This is an important base class in LLVM.
Definition Constant.h:41
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:112
bool isBigEndian() const
Definition DataLayout.h:242
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.h:498
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:33
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:65
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:650
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:714
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:239
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:250
const Function & getFunction() const
Definition Function.h:136
arg_iterator arg_begin()
Definition Function.h:794
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.h:356
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:95
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition IRBuilder.h:1857
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2391
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2159
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:178
PointerType * getInt8PtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer to an 8-bit integer value.
Definition IRBuilder.h:561
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:508
This instruction inserts a single (scalar) element into a VectorType value.
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
static LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
static ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:287
LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:361
ScalarTy getKnownMinValue() const
Returns the minimum value this size can represent.
Definition TypeSize.h:297
static ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:284
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
This class is used to represent ISD::LOAD nodes.
MCRegisterInfo base class - We assume that the target defines a static array of MCRegisterDesc object...
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
static MVT getVectorVT(MVT VT, unsigned NumElements)
static auto fp_scalable_vector_valuetypes()
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
void computeMaxCallFrameSize(const MachineFunction &MF)
Computes the maximum size of a callframe and the AdjustsStack property.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
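A hedged sketch of how the MachineFunction, MachineRegisterInfo, and MachineInstrBuilder pieces above compose when expanding a pseudo in a custom inserter. AArch64::MOVi32imm, AArch64::B, and AArch64::GPR32RegClass are real AArch64 definitions; the wrapper function itself and its name are illustrative only.

static void emitImmAndBranch(MachineBasicBlock *MBB, MachineBasicBlock *Target,
                             int64_t Imm, const TargetInstrInfo *TII,
                             const DebugLoc &DL) {
  MachineFunction &MF = *MBB->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  // Materialize the immediate into a fresh virtual register.
  Register Dst = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
  BuildMI(*MBB, MBB->end(), DL, TII->get(AArch64::MOVi32imm), Dst).addImm(Imm);
  // Branch to an existing block and record the CFG edge.
  BuildMI(*MBB, MBB->end(), DL, TII->get(AArch64::B)).addMBB(Target);
  MBB->addSuccessor(Target);
}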
Representation of each machine instruction.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This class is used to represent an MLOAD node.
This class is used to represent an MSCATTER node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
bool isVolatile() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
unsigned getAlignment() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.cpp:401
void dump() const
Definition Pass.cpp:131
Class to represent pointers.
Type * getElementType() const
Wrapper class representing virtual and physical registers.
Definition Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
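The SDNode/SDValue accessors above are the building blocks of DAG-combine predicates. Below is a hedged sketch of that idiom, assuming the usual SelectionDAG headers (for ConstantSDNode and dyn_cast); the helper name is illustrative.

static bool isSingleUseAddWithConstant(SDValue V, uint64_t &ImmOut) {
  // Opcode check, one-use query, and operand access as listed above.
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  if (auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
    ImmOut = C->getZExtValue();
    return true;
  }
  return false;
}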
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=None, int Offset=0, unsigned TargetFlags=0)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
SDValue getStepVector(const SDLoc &DL, EVT ResVT, APInt StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
void addCallSiteInfo(const SDNode *CallNode, CallSiteInfoImpl &&CallInfo)
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
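A hedged sketch of how the SelectionDAG builders above compose when lowering a small pattern, here an integer absolute value built from a subtract, a compare, and a select. The MVT::i1 condition type is a simplification (real lowering code queries the target's setcc result type), and the function is illustrative, not taken from this file.

static SDValue emitIntegerAbsSketch(SelectionDAG &DAG, SDValue X, const SDLoc &DL) {
  EVT VT = X.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, Zero, X);      // 0 - X
  SDValue IsNeg = DAG.getSetCC(DL, MVT::i1, X, Zero, ISD::SETLT);
  return DAG.getSelect(DL, VT, IsNeg, Neg, X);               // IsNeg ? -X : X
}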
This instruction constructs a fixed permutation of two input vectors.
static bool isReverseMask(ArrayRef< int > Mask)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
VectorType * getType() const
Overload to return most specific vector type.
bool isZeroEltSplat() const
Return true if all elements of this shuffle are the same value as the first element of exactly one so...
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
StackOffset is a class to represent an offset with 2 dimensions, named fixed and scalable,...
Definition TypeSize.h:134
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:58
std::enable_if_t< std::numeric_limits< T >::is_signed, bool > getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:510
LLVM_NODISCARD StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:732
LLVM_NODISCARD size_t size() const
size - Get the string size.
Definition StringRef.h:157
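A hedged sketch of the slice/getAsInteger idiom shown above, applied to parsing a register-style token such as "x15"; the function name and the constraint format are illustrative assumptions.

static bool parseGPRConstraint(StringRef Constraint, unsigned &RegNo) {
  if (Constraint.size() < 2 || Constraint[0] != 'x')
    return false;
  // Drop the leading 'x' and parse the remainder as a base-10 integer.
  // getAsInteger returns true on failure, so invert the result.
  return !Constraint.slice(1, Constraint.size()).getAsInteger(10, RegNo);
}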
Class to represent struct types.
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:372
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setTargetDAGCombine(ISD::NodeType NT)
Targets should invoke this method for each target independent node that they want to provide a custom...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setCondCodeAction(ISD::CondCode CC, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
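The TargetLoweringBase hooks above are normally invoked from a backend's TargetLowering constructor. The following is a minimal sketch of that idiom under stated assumptions: MyTargetLowering, MyTarget::GPR32RegClass, and MyTargetSubtarget are hypothetical names, and the particular operations and actions are illustrative, not what AArch64TargetLowering actually registers.

MyTargetLowering::MyTargetLowering(const TargetMachine &TM,
                                   const MyTargetSubtarget &STI)
    : TargetLowering(TM) {
  addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);       // i32 lives in GPRs
  setOperationAction(ISD::SDIV, MVT::i32, Expand);            // no native divide
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);       // handled in LowerOperation
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);             // no truncating i64->i8 store
  setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i8, Expand);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  computeRegisterProperties(STI.getRegisterInfo());           // derive legal register info
}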
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:45
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition Triple.h:584
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition Triple.h:557
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:83
ScalarTy getFixedSize() const
Definition TypeSize.h:426
ScalarTy getKnownMinSize() const
Definition TypeSize.h:427
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:204
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:228
@ FloatTyID
32-bit floating point type
Definition Type.h:58
@ DoubleTyID
64-bit floating point type
Definition Type.h:59
static Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:186
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition Type.cpp:255
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:128
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:127
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition Type.cpp:738
ScalarTy getValue() const
Definition TypeSize.h:232
A Use represents the edge between a Value definition and its users.
Definition Use.h:44
const Use & getOperandUse(unsigned i) const
Definition User.h:182
Value * getOperand(unsigned i) const
Definition User.h:169
unsigned getNumOperands() const
Definition User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
User * user_back()
Definition Value.h:408
Base class of all SIMD vector types.
Type * getElementType() const
Implementation for an ilist node.
Definition ilist_node.h:39
self_iterator getIterator()
Definition ilist_node.h:81
#define UINT64_MAX
Definition DataTypes.h:77
#define INT64_MAX
Definition DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
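A hedged sketch of the logical-immediate helpers listed above: check whether a 64-bit mask is representable as an AND/ORR/EOR immediate and, if so, obtain its encoding. The wrapper function is illustrative; only the two AArch64_AM calls are from the documented interface.

static bool tryEncodeLogicalImm64(uint64_t Imm, uint64_t &Enc) {
  if (!AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64))
    return false;
  Enc = AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
  return true;
}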
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
const unsigned RoundingBitsPos
static constexpr unsigned SVEBitsPerBlock
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:80
@ AArch64_SVE_VectorCall
Calling convention between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:87
@ Fast
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:42
@ Tail
Tail - This calling convention attempts to make calls as fast as possible while guaranteeing that tail...
Definition CallingConv.h:81
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ SwiftTail
SwiftTail - This follows the Swift calling convention in how arguments are passed but guarantees tail...
Definition CallingConv.h:92
@ C
C - The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:702
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:236
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:462
@ FLT_ROUNDS_
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:825
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:250
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:535
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:666
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:269
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition ISDOpcodes.h:921
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:732
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:466
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:199
@ GlobalAddress
Definition ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:739
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:519
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:377
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:640
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:255
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:848
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:229
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:830
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:726
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:436
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:583
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition ISDOpcodes.h:915
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition ISDOpcodes.h:866
@ BR_CC
BR_CC - Conditional branch.
Definition ISDOpcodes.h:963
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:314
@ BRIND
BRIND - Indirect branch.
Definition ISDOpcodes.h:942
@ BR_JT
BR_JT - Jumptable branch.
Definition ISDOpcodes.h:946
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition ISDOpcodes.h:609
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:336
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:679
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:222
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:590
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:310
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:614
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:657
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:563
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:549
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:511
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:729
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:694
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition ISDOpcodes.h:898
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:318
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition ISDOpcodes.h:931
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:747
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:626
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition ISDOpcodes.h:554
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:833
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:688
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:435
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:94
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:429
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:451
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:428
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition ISDOpcodes.h:911
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:785
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:158
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:632
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:279
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:500
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:52
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition ISDOpcodes.h:575
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:814
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:106
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:800
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:735
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
Definition ISDOpcodes.h:956
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:715
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:476
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:327
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:491
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
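A small hedged sketch of how getSetCCSwappedOperands is used: when the operands of a comparison are swapped, the condition code must be swapped with it (e.g. SETLT becomes SETGT); getSetCCInverse plays the analogous role when the comparison is negated. The wrapper is illustrative.

static SDValue swapComparison(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                              SDValue LHS, SDValue RHS, ISD::CondCode CC) {
  // Rebuild (LHS CC RHS) as (RHS CC' LHS) with the matching swapped condition.
  return DAG.getSetCC(DL, VT, RHS, LHS, ISD::getSetCCSwappedOperands(CC));
}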
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
bool match(Val *V, const Pattern &P)
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
match_combine_or< CastClass_match< OpTy, Instruction::ZExt >, CastClass_match< OpTy, Instruction::SExt > > m_ZExtOrSExt(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
auto m_Undef()
Match an arbitrary undef constant.
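A hedged sketch of the PatternMatch helpers above, recognizing "extractelement (shufflevector X, Y, mask), idx" in IR, the kind of matching an operand-sinking hook performs. It assumes "using namespace llvm::PatternMatch"; the function itself is illustrative.

static bool isExtractOfShuffle(Value *V) {
  Value *Vec, *Idx;
  // First match the extractelement and capture the vector operand.
  if (!match(V, m_ExtractElt(m_Value(Vec), m_Value(Idx))))
    return false;
  // Then check that the vector comes from a shufflevector (any mask).
  Value *A, *B;
  return match(Vec, m_Shuffle(m_Value(A), m_Value(B)));
}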
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
initializer< Ty > init(const Ty &Val)
CodeModel::Model getCodeModel()
constexpr double e
Definition MathExtras.h:57
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition ObjCARCUtil.h:34
------------------------------ PointerInfo ------------------------------
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1554
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool operator==(uint64_t V1, const APInt &V2)
Definition APInt.h:2030
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:455
bool RetCC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or an FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:496
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1361
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:602
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:485
unsigned M1(unsigned Val)
Definition VE.h:372
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1541
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:596
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned countLeadingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the most significant bit to the least, stopping at the first 1.
Definition MathExtras.h:225
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:491
constexpr size_t array_lengthof(T(&)[N])
Find the length of an array.
Definition STLExtras.h:1377
unsigned countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the least significant bit to the most, stopping at the first 1.
Definition MathExtras.h:156
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:163
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition ArrayRef.h:476
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition Error.cpp:140
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:473
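A hedged sketch combining the bit utilities above: decompose a shifted run of ones (e.g. 0x0000FF00) into its shift amount and width, the kind of analysis done when folding masks into bitfield instructions. The helper name is illustrative.

static bool decomposeShiftedMask(uint64_t Imm, unsigned &Shift, unsigned &Width) {
  if (!isShiftedMask_64(Imm))
    return false;
  Shift = countTrailingZeros(Imm);       // position of the lowest set bit
  Width = Log2_64(Imm >> Shift) + 1;     // number of contiguous ones
  return true;
}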
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Z
zlib-style compression
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition DAGCombine.h:15
bool is_splat(R &&Range)
Wrapper function around std::equal to detect if all elements in a container are the same.
Definition STLExtras.h:1701
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:460
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:148
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition MathExtras.h:672
@ Invalid
Denotes invalid value.
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1561
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:1715
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition STLExtras.h:1599
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:124
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:840
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
Helper structure to be able to read SetCC information.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:363
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:130
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:140
ElementCount getVectorElementCount() const
Definition ValueTypes.h:323
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:425
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:341
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:332
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:353
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:432
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:289
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:186
EVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:114
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:349
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:406
bool isFixedLengthVector() const
Definition ValueTypes.h:165
std::string getEVTString() const
This function returns value type as a string, e.g. "i32".
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:155
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:296
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:161
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:301
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:150
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:309
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:415
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:145
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:181
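To tie the EVT queries above together, a small sketch (Ctx is an assumed LLVMContext; the chosen type is arbitrary):

  EVT VT = MVT::nxv4i32;                  // scalable vector of 4 x i32
  assert(VT.isScalableVector() && VT.isInteger() && !VT.isFloatingPoint());
  EVT EltVT  = VT.getVectorElementType();              // i32
  EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);     // nxv2i32
  EVT WideVT = VT.widenIntegerVectorElementType(Ctx);  // nxv4i64
  EVT FPVT   = VT.changeVectorElementType(MVT::f32);   // nxv4f32
  TypeSize Size = VT.getSizeInBits();     // scalable, minimum 128 bits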
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing (from the perspective of the caller) return value virtual register.
static KnownBits commonBits(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits common to LHS and RHS.
Definition KnownBits.h:289
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:40
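A brief sketch of these two KnownBits helpers; the bit patterns are invented for illustration:

  KnownBits LHS(8), RHS(8);
  LHS.Zero.setHighBits(4);               // LHS: top nibble known to be 0
  RHS.Zero.setHighBits(4);
  RHS.One.setBit(0);                     // RHS: bit 0 known to be 1
  KnownBits Common = KnownBits::commonBits(LHS, RHS);
  assert(Common.getBitWidth() == 8);
  // Only facts shared by both survive: the high nibble stays known-zero,
  // but bit 0 is no longer known because LHS says nothing about it.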
Structure used to represent a pair of argument number after call lowering and the register used to transfer that argument.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
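A usage sketch of the MachinePointerInfo factories and accessors listed above (MF and a frame index FI are assumed to exist):

  MachinePointerInfo StackPI = MachinePointerInfo::getStack(MF, /*Offset=*/0);
  MachinePointerInfo GOTPI   = MachinePointerInfo::getGOT(MF);
  MachinePointerInfo SlotPI  = MachinePointerInfo::getFixedStack(MF, FI);
  MachinePointerInfo PartPI  = SlotPI.getWithOffset(8);  // same slot, +8 bytes
  unsigned AS = PartPI.getAddrSpace();                   // IR address space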
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:109
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowReassociation(bool b)
void setNoUnsignedWrap(bool b)
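For example (a sketch; DAG, DL, VT, A, and B are assumed to be in scope), flags are built up and then passed when the node is created:

  SDNodeFlags Flags;
  Flags.setAllowReassociation(true);
  Flags.setNoUnsignedWrap(true);
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, A, B, Flags);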
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg. If BaseGV is null, there is no BaseGV; if BaseOffs is zero, there is no base offset; if HasBaseReg is false, there is no base register; if Scale is zero, there is no ScaleReg, and a Scale of 1 indicates a register with no scale.
This structure contains all information that is necessary for lowering calls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64